Implement markdown tokeniser

This commit is contained in:
Andras Schmelczer 2026-03-11 20:39:04 +00:00
parent bd2ef79fb1
commit 9a82d6d8dd
2 changed files with 310 additions and 12 deletions

View file

@ -0,0 +1,290 @@
use super::{token::Token, word_tokenizer::split_words};
/// Tokenizes markdown so that structural markers survive a later merge
///
/// The text is processed line by line on top of the plain word splitter, with
/// three markdown-aware rules:
/// - every line break (`\n` or `\r\n`) is emitted as its own non-joinable
///   token, which keeps block boundaries intact
/// - a block-level marker at the start of a line (heading, list marker,
///   blockquote, task checkbox) is fused with the first token that follows it
///   on that line; a marker with nothing after it becomes a standalone token
/// - whitespace inside a line is normalized the same way the word tokenizer
///   does it
///
/// Inline markup such as `**bold**` needs no special handling: it contains no
/// whitespace, so word-level splitting already keeps it in one token.
///
/// ## Example
///
/// ```not_rust
/// "# Hello\n- item" -> ["# Hello", "\n", "- item"]
/// ```
pub fn markdown_tokenizer(text: &str) -> Vec<Token<String>> {
    let mut tokens: Vec<Token<String>> = Vec::new();

    for segment in split_preserving_newlines(text) {
        // Line terminators are emitted verbatim and flagged as non-joinable on
        // both sides.
        if matches!(segment, "\n" | "\r\n") {
            tokens.push(Token::new(
                segment.to_owned(),
                segment.to_owned(),
                false,
                false,
            ));
            continue;
        }

        // The prefix is pure ASCII, so its length is a valid split point.
        let (prefix, content) = segment.split_at(block_prefix_len(segment));
        let mut words = split_words(content);

        if !prefix.is_empty() {
            match words.first() {
                // Nothing follows the marker: keep it as a token of its own.
                None => tokens.push(Token::new(
                    prefix.to_owned(),
                    prefix.to_owned(),
                    false,
                    false,
                )),
                // Glue the marker onto the first token of the line so the two
                // can never be separated.
                Some(head) => {
                    let fused = Token::new(
                        format!("{prefix}{}", head.normalized()),
                        format!("{prefix}{}", head.original()),
                        false,
                        head.is_right_joinable,
                    );
                    words[0] = fused;
                }
            }
        }

        tokens.extend(words);
    }

    // Make every run of inline whitespace unique in the diff by appending the
    // original text of the token that follows it (the word tokenizer does the
    // same). Newline tokens are deliberately left untouched, and the last
    // token has no successor to borrow from.
    for next in 1..tokens.len() {
        let current = next - 1;
        let is_inline_whitespace = tokens[current]
            .original()
            .chars()
            .all(|c| c.is_whitespace() && !matches!(c, '\n' | '\r'));
        if is_inline_whitespace {
            let mut normalized = tokens[current].normalized().to_owned();
            normalized.push_str(tokens[next].original());
            tokens[current].set_normalized(normalized);
        }
    }

    tokens
}
/// Breaks `text` into line bodies and line terminators, in order of appearance
///
/// Every `\n` or `\r\n` becomes a segment of its own, and so does the
/// non-empty text between two terminators. Empty lines contribute only their
/// terminator. A `\r` that is not immediately followed by `\n` is not a
/// terminator and stays inside the line body. Concatenating the returned
/// segments yields `text` again.
fn split_preserving_newlines(text: &str) -> Vec<&str> {
    let mut segments = Vec::new();

    // Each chunk is a line body followed by at most one `\n`; only the final
    // chunk can be missing its terminator.
    for chunk in text.split_inclusive('\n') {
        let body = chunk
            .strip_suffix("\r\n")
            .or_else(|| chunk.strip_suffix('\n'))
            .unwrap_or(chunk);
        let terminator = &chunk[body.len()..];

        if !body.is_empty() {
            segments.push(body);
        }
        if !terminator.is_empty() {
            segments.push(terminator);
        }
    }

    segments
}
/// Measures the markdown block-level marker that opens `line`
///
/// Returns the number of leading bytes (indentation included) that belong to
/// the marker, or 0 when the line does not start with one. Leading spaces and
/// tabs are skipped before matching. Every byte counted here is ASCII, so the
/// result is always a valid UTF-8 boundary of `line`.
///
/// Markers that are recognized:
/// - ATX headings: one to six `#` followed by a space
/// - Blockquotes: `> `, a line consisting of just `>`, or the first `>` of
///   `>>` (only one level is consumed)
/// - Unordered lists: `- `, `* `, `+ `
/// - Ordered lists: digits followed by `.` or `)` and a space
/// - Task lists: a `[ ] `, `[x] ` or `[X] ` checkbox directly after a list
///   marker is counted as part of the prefix
fn block_prefix_len(line: &str) -> usize {
    let body = line.trim_start_matches([' ', '\t']);
    let indent = line.len() - body.len();
    let bytes = body.as_bytes();

    match bytes {
        // Heading: a run of at most six hashes terminated by a space.
        [b'#', ..] => {
            let hashes = bytes.iter().take_while(|&&b| b == b'#').count();
            if hashes <= 6 && bytes.get(hashes) == Some(&b' ') {
                indent + hashes + 1
            } else {
                0
            }
        }
        // Blockquote with its separating space.
        [b'>', b' ', ..] => indent + 2,
        // Bare `>` on its own, or the outer level of a nested `>>` quote.
        [b'>'] | [b'>', b'>', ..] => indent + 1,
        // Bullet followed by a space, plus an optional task checkbox.
        [b'-' | b'*' | b'+', b' ', ..] => {
            let marker_end = indent + 2;
            marker_end + task_checkbox_len(&line[marker_end..])
        }
        // Numbered item: digits, then `.` or `)`, then a space, plus an
        // optional task checkbox.
        [b'0'..=b'9', ..] => {
            let digits = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
            match bytes.get(digits..digits + 2) {
                Some([b'.' | b')', b' ']) => {
                    let marker_end = indent + digits + 2;
                    marker_end + task_checkbox_len(&line[marker_end..])
                }
                _ => 0,
            }
        }
        _ => 0,
    }
}
/// Measures a task list checkbox at the very start of `rest`
///
/// A checkbox is `[`, then a space, `x` or `X`, then `]`, then a space.
/// Returns 4 (its byte length) when `rest` begins with one and 0 otherwise.
fn task_checkbox_len(rest: &str) -> usize {
    match rest.as_bytes() {
        [b'[', b' ' | b'x' | b'X', b']', b' ', ..] => 4,
        _ => 0,
    }
}
// Snapshot tests for `markdown_tokenizer`: each test records the `Debug`
// output of the returned token vector through `assert_debug_snapshot!`.
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
// Text without any block-level marker or line break.
#[test]
fn test_plain_text() {
assert_debug_snapshot!(markdown_tokenizer("Hello world"));
}
// Empty input yields no segments, so no tokens are pushed.
#[test]
fn test_empty() {
assert_debug_snapshot!(markdown_tokenizer(""));
}
// ATX headings with one, two and six hashes (six is the maximum accepted).
// NOTE(review): three snapshots live in one test; presumably insta tells
// them apart by call order, so confirm before reordering these lines.
#[test]
fn test_headings() {
assert_debug_snapshot!(markdown_tokenizer("# Hello world"));
assert_debug_snapshot!(markdown_tokenizer("## Sub heading"));
assert_debug_snapshot!(markdown_tokenizer("###### Deep heading"));
}
// `- ` bullets spread over several lines.
#[test]
fn test_unordered_list() {
assert_debug_snapshot!(markdown_tokenizer("- item one\n- item two\n- item three"));
}
// Numbered items using the `N. ` form.
#[test]
fn test_ordered_list() {
assert_debug_snapshot!(markdown_tokenizer("1. first\n2. second\n3. third"));
}
// `> ` blockquote markers on consecutive lines.
#[test]
fn test_blockquote() {
assert_debug_snapshot!(markdown_tokenizer("> quoted text\n> more quoted"));
}
// Inline emphasis inside a line that has no block-level prefix.
#[test]
fn test_inline_formatting() {
assert_debug_snapshot!(markdown_tokenizer("Some **bold** and *italic* text"));
}
// Heading, blank lines, paragraph and list combined in one document.
#[test]
fn test_mixed_content() {
assert_debug_snapshot!(markdown_tokenizer(
"# Title\n\nSome text with **bold**.\n\n- list item\n- another item"
));
}
// List markers preceded by indentation; the indentation is counted as part
// of the prefix.
#[test]
fn test_indented_list() {
assert_debug_snapshot!(markdown_tokenizer(" - nested item\n - deeper"));
}
// `\r\n` must come out as a single line-break token.
#[test]
fn test_crlf() {
assert_debug_snapshot!(markdown_tokenizer("Line 1\r\nLine 2"));
}
// Fence lines match none of the recognized prefixes, so they are tokenized
// like ordinary text lines.
#[test]
fn test_code_fence() {
assert_debug_snapshot!(markdown_tokenizer("```rust\nlet x = 1;\n```"));
}
// A heading marker with nothing after it; presumably `split_words("")`
// returns no tokens, which exercises the standalone-prefix branch.
#[test]
fn test_heading_only() {
assert_debug_snapshot!(markdown_tokenizer("# "));
}
// Link syntax inside a line that has no block-level prefix.
#[test]
fn test_link() {
assert_debug_snapshot!(markdown_tokenizer("Click [here](https://example.com) now"));
}
// Single line breaks within a paragraph followed by a blank line.
#[test]
fn test_multiline_paragraph() {
assert_debug_snapshot!(markdown_tokenizer(
"First line\nSecond line\n\nNew paragraph"
));
}
// `* ` is accepted as a bullet marker just like `- `.
#[test]
fn test_list_with_star_marker() {
assert_debug_snapshot!(markdown_tokenizer("* item one\n* item two"));
}
// A leading `**` is not followed by a space, so it must not be detected as
// a `* ` list marker.
#[test]
fn test_bold_not_confused_with_list() {
assert_debug_snapshot!(markdown_tokenizer("**bold text**"));
}
// Bullets followed by each of the three accepted checkbox forms.
#[test]
fn test_task_list() {
assert_debug_snapshot!(markdown_tokenizer(
"- [ ] todo\n- [x] done\n- [X] also done"
));
}
// Checkboxes are also recognized after ordered list markers.
#[test]
fn test_ordered_task_list() {
assert_debug_snapshot!(markdown_tokenizer("1. [ ] first task\n2. [x] second task"));
}
// Multi-byte characters directly after a prefix: slicing the line at the
// prefix length has to land on a character boundary.
#[test]
fn test_unicode() {
assert_debug_snapshot!(markdown_tokenizer(
"# \u{1F600} Héllo\n- \u{00E9}lément\n> \u{4F60}\u{597D} world"
));
}
}

View file

@ -9,6 +9,26 @@ use super::token::Token;
/// "Hi there!" -> ["Hi", " ", "there!"]
/// ```
pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
    let mut tokens = split_words(text);

    // Give every whitespace token a normalized form that also contains the
    // original text of the token after it. The last token has no successor
    // and is left as is; an empty token list makes the range below empty.
    for next in 1..tokens.len() {
        let current = next - 1;
        if tokens[current].original().chars().all(char::is_whitespace) {
            let mut normalized = tokens[current].normalized().to_owned();
            normalized.push_str(tokens[next].original());
            tokens[current].set_normalized(normalized);
        }
    }

    tokens
}
/// Splits text into alternating word and whitespace tokens without any
/// normalization. Shared by `word_tokenizer` and `markdown_tokenizer`.
pub(super) fn split_words(text: &str) -> Vec<Token<String>> {
let mut result = Vec::new();
let mut previous_boundary_index = 0;
@ -28,18 +48,6 @@ pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
result.push(text[previous_boundary_index..].into());
}
if result.is_empty() {
return result;
}
// normalize whitespace tokens by concatenating with the following token
for i in 0..result.len() - 1 {
if result[i].original().chars().all(char::is_whitespace) {
let normalized = result[i].normalized().to_owned() + result[i + 1].original();
result[i].set_normalized(normalized);
}
}
result
}