Add left/right joinability for tokens

This commit is contained in:
Andras Schmelczer 2025-04-05 13:48:02 +01:00
parent b0c6c082a1
commit b230d34b88
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
13 changed files with 313 additions and 75 deletions

View file

@ -5,11 +5,21 @@ snapshot_kind: text
---
[
Token {
normalised: "what?",
original: " what?",
normalised: " what?",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "",
normalised: "what?",
original: "what?",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -4,20 +4,52 @@ expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: " hello,",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "hello,",
original: " hello,",
original: "hello,",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " \nwhere",
original: " \n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "where",
original: " \nwhere",
original: "where",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " are",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "are",
original: " are",
original: "are",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " you?",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "you?",
original: " you?",
original: "you?",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,39 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "hello,",
original: "hello,",
},
Token {
normalised: " \n",
original: " \n",
},
Token {
normalised: "where",
original: "where",
},
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "are",
original: "are",
},
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "you?",
original: "you?",
},
]

View file

@ -7,9 +7,19 @@ snapshot_kind: text
Token {
normalised: "Hi",
original: "Hi",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " there!",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "there!",
original: " there!",
original: "there!",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -3,29 +3,45 @@ use serde::{Deserialize, Serialize};
/// A token is a string that has been normalised in some way.
/// The normalised form is used for comparison, while the original form is used
/// for applying Operations.
/// for applying `Operation`-s.
///
/// `T` is the type of the normalised form; it must support equality
/// comparison, cloning and debug formatting.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct Token<T>
where
T: PartialEq + Clone + std::fmt::Debug,
{
normalised: T,
/// The normalised form of the token, used for deriving the diff.
pub normalised: T,
/// The original string that should be inserted or deleted in the document.
original: String,
/// Whether the token is joinable with the previous token.
is_left_joinable: bool,
/// Whether the token is joinable with the next token.
is_right_joinable: bool,
}
/// Converts a string slice into a `Token<String>`.
impl From<&str> for Token<String> {
/// Uses the whitespace-trimmed text as the normalised form and the
/// untrimmed text as the original.
fn from(s: &str) -> Self { Token::new(s.trim().to_owned(), s.to_owned()) }
/// Uses the text unchanged as both the normalised and the original form,
/// and marks the token as joinable on both the left and the right.
fn from(text: &str) -> Self { Token::new(text.to_owned(), text.to_owned(), true, true) }
}
impl<T> Token<T>
where
T: PartialEq + Clone + std::fmt::Debug,
{
/// Creates a token from its parts.
///
/// * `normalised` - the normalised form of the token.
/// * `original` - the original text of the token.
/// * `is_left_joinable` - whether the token is joinable with the previous token.
/// * `is_right_joinable` - whether the token is joinable with the next token.
///
/// The arguments are stored in the token as given, without any processing.
pub fn new(normalised: T, original: String) -> Self {
pub fn new(
normalised: T,
original: String,
is_left_joinable: bool,
is_right_joinable: bool,
) -> Self {
Token {
normalised,
original,
is_left_joinable,
is_right_joinable,
}
}
@ -34,6 +50,10 @@ where
/// Returns a reference to the normalised form of the token.
pub fn normalised(&self) -> &T { &self.normalised }
/// Returns the length of the original text, counted in `char`s (Unicode
/// scalar values) rather than in bytes.
pub fn get_original_length(&self) -> usize { self.original.chars().count() }
/// Returns whether the token is joinable with the previous token.
pub fn get_is_left_joinable(&self) -> bool { self.is_left_joinable }
/// Returns whether the token is joinable with the next token.
pub fn get_is_right_joinable(&self) -> bool { self.is_right_joinable }
}
impl<T> PartialEq for Token<T>