Merge crates

This commit is contained in:
Andras Schmelczer 2025-06-15 11:30:07 +01:00
parent 82e77eec89
commit bcbac03228
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
60 changed files with 73 additions and 248 deletions

View file

@ -0,0 +1,6 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\"\")"
snapshot_kind: text
---
[]

View file

@ -0,0 +1,25 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" what? \")"
snapshot_kind: text
---
[
Token {
normalised: " what?",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "what?",
original: "what?",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,55 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: " hello,",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "hello,",
original: "hello,",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " \nwhere",
original: " \n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "where",
original: "where",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " are",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "are",
original: "are",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " you?",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "you?",
original: "you?",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,39 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "hello,",
original: "hello,",
},
Token {
normalised: " \n",
original: " \n",
},
Token {
normalised: "where",
original: "where",
},
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "are",
original: "are",
},
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "you?",
original: "you?",
},
]

View file

@ -0,0 +1,25 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\"Hi there!\")"
snapshot_kind: text
---
[
Token {
normalised: "Hi",
original: "Hi",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " there!",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "there!",
original: "there!",
is_left_joinable: true,
is_right_joinable: true,
},
]

68
src/tokenizer/token.rs Normal file
View file

@ -0,0 +1,68 @@
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// A piece of text paired with the key that is used to compare it.
///
/// Every token carries two views of the same piece of text: the normalised
/// form of type `T`, which is what gets compared when deriving a diff, and
/// the original string, which is what is inserted into or deleted from the
/// text document when `Operation`-s are applied.
///
/// The original text is stored as a `String`, so it is always valid UTF-8.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct Token<T: PartialEq + Clone + std::fmt::Debug> {
    /// Comparison key: the normalised form consulted while deriving the diff.
    normalised: T,

    /// Verbatim text that should be inserted into or deleted from the
    /// document.
    original: String,

    /// Whether the token is semantically joinable with the token before it.
    pub is_left_joinable: bool,

    /// Whether the token is semantically joinable with the token after it.
    pub is_right_joinable: bool,
}
/// Trivial implementation of Token when the normalised form is the same as the
/// original string.
impl From<&str> for Token<String> {
fn from(text: &str) -> Self { Token::new(text.to_owned(), text.to_owned(), true, true) }
}
impl<T> Token<T>
where
T: PartialEq + Clone + std::fmt::Debug,
{
pub fn new(
normalised: T,
original: String,
is_left_joinable: bool,
is_right_joinable: bool,
) -> Self {
Token {
normalised,
original,
is_left_joinable,
is_right_joinable,
}
}
pub fn original(&self) -> &str { &self.original }
pub fn set_normalised(&mut self, normalised: T) { self.normalised = normalised; }
pub fn normalised(&self) -> &T { &self.normalised }
pub fn get_original_length(&self) -> usize { self.original.chars().count() }
}
/// Tokens are equal when their normalised forms are equal; the original text
/// and the joinable flags are not compared.
impl<T: PartialEq + Clone + std::fmt::Debug> PartialEq for Token<T> {
    fn eq(&self, other: &Self) -> bool {
        PartialEq::eq(&self.normalised, &other.normalised)
    }
}

View file

@ -0,0 +1,61 @@
use super::token::Token;
/// Splits text on word boundaries creating tokens of alternating words and
/// whitespaces with the whitespaces getting unique IDs.
///
/// ## Example
///
/// ```not_rust
/// "Hi there!" -> ["Hi", " ", "there!"]
/// ```
pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
let mut result: Vec<Token<String>> = Vec::new();
let mut previous_boundary_index = 0;
let mut previous_char_is_whitespace = text.chars().next().is_none_or(char::is_whitespace);
for (i, c) in text.char_indices() {
let is_current_char_whitespace = c.is_whitespace();
if previous_char_is_whitespace != is_current_char_whitespace {
result.push(text[previous_boundary_index..i].into());
previous_boundary_index = i;
}
previous_char_is_whitespace = is_current_char_whitespace;
}
if previous_boundary_index < text.len() {
result.push(text[previous_boundary_index..].into());
}
if result.is_empty() {
return result;
}
for i in 0..result.len() - 1 {
if result[i].original().chars().all(char::is_whitespace) {
let normalised = result[i].normalised().to_owned() + result[i + 1].original();
result[i].set_normalised(normalised);
}
}
result
}
// Snapshot tests for `word_tokenizer`.
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use super::*;
// Records the debug output of `word_tokenizer` for a plain sentence, the
// empty string, text with leading and trailing whitespace, and text that
// contains a newline.
//
// NOTE(review): the snapshots are unnamed, so insta presumably tells them
// apart by their position within this function. Confirm before reordering,
// removing or inserting calls, as the stored snapshot files may no longer
// line up.
#[test]
fn test_with_snapshots() {
assert_debug_snapshot!(word_tokenizer("Hi there!"));
assert_debug_snapshot!(word_tokenizer(""));
assert_debug_snapshot!(word_tokenizer(" what? "));
assert_debug_snapshot!(word_tokenizer(" hello, \nwhere are you?"));
}
}