Add tokenizer

This commit is contained in:
Andras Schmelczer 2024-11-24 22:32:06 +00:00
parent e910d9c5f4
commit 331e264399
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
7 changed files with 119 additions and 33 deletions

View file

@ -18,6 +18,7 @@
//! without making reasonable progress. //! without making reasonable progress.
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
use std::hash::Hash;
use std::ops::{Index, IndexMut, Range}; use std::ops::{Index, IndexMut, Range};
use std::vec; use std::vec;
@ -33,11 +34,14 @@ use super::raw_operation::RawOperation;
/// ///
/// This diff is done with an optional deadline that defines the maximal /// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation. /// execution time permitted before it bails and falls back to an approximation.
pub fn diff(old: &[Token], new: &[Token]) -> Vec<RawOperation> { pub fn diff<T>(old: &[Token<T>], new: &[Token<T>]) -> Vec<RawOperation<T>>
where
T: PartialEq + Hash + Clone,
{
let max_d = max_d(old.len(), new.len()); let max_d = max_d(old.len(), new.len());
let mut vb = V::new(max_d); let mut vb = V::new(max_d);
let mut vf = V::new(max_d); let mut vf = V::new(max_d);
let mut result: Vec<RawOperation> = vec![]; let mut result: Vec<RawOperation<T>> = vec![];
conquer( conquer(
old, old,
0..old.len(), 0..old.len(),
@ -118,14 +122,17 @@ fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
/// simultaneously run the basic algorithm in both the forward and reverse /// simultaneously run the basic algorithm in both the forward and reverse
/// directions until furthest reaching forward and reverse paths starting at /// directions until furthest reaching forward and reverse paths starting at
/// opposing corners 'overlap'. /// opposing corners 'overlap'.
fn find_middle_snake( fn find_middle_snake<T>(
old: &[Token], old: &[Token<T>],
old_range: Range<usize>, old_range: Range<usize>,
new: &[Token], new: &[Token<T>],
new_range: Range<usize>, new_range: Range<usize>,
vf: &mut V, vf: &mut V,
vb: &mut V, vb: &mut V,
) -> Option<(usize, usize)> { ) -> Option<(usize, usize)>
where
T: PartialEq + Hash + Clone,
{
let n = old_range.len(); let n = old_range.len();
let m = new_range.len(); let m = new_range.len();
@ -222,15 +229,17 @@ fn find_middle_snake(
None None
} }
fn conquer( fn conquer<T>(
old: &[Token], old: &[Token<T>],
mut old_range: Range<usize>, mut old_range: Range<usize>,
new: &[Token], new: &[Token<T>],
mut new_range: Range<usize>, mut new_range: Range<usize>,
vf: &mut V, vf: &mut V,
vb: &mut V, vb: &mut V,
result: &mut Vec<RawOperation>, result: &mut Vec<RawOperation<T>>,
) { ) where
T: PartialEq + Hash + Clone,
{
// Check for common prefix // Check for common prefix
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
if common_prefix_len > 0 { if common_prefix_len > 0 {

View file

@ -1,14 +1,21 @@
use crate::tokenizer::token::Token; use crate::tokenizer::token::Token;
use std::hash::Hash;
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq)]
pub enum RawOperation { pub enum RawOperation<T>
Insert(Vec<Token>), where
Delete(Vec<Token>), T: PartialEq + Hash + Clone,
Equal(Vec<Token>), {
Insert(Vec<Token<T>>),
Delete(Vec<Token<T>>),
Equal(Vec<Token<T>>),
} }
impl RawOperation { impl<T> RawOperation<T>
pub fn tokens(&self) -> &Vec<Token> { where
T: PartialEq + Hash + Clone,
{
pub fn tokens(&self) -> &Vec<Token<T>> {
match self { match self {
RawOperation::Insert(tokens) => tokens, RawOperation::Insert(tokens) => tokens,
RawOperation::Delete(tokens) => tokens, RawOperation::Delete(tokens) => tokens,
@ -17,13 +24,28 @@ impl RawOperation {
} }
pub fn original_text_length(&self) -> usize { pub fn original_text_length(&self) -> usize {
self.tokens() self.tokens().iter().map(|t| t.get_original_length()).sum()
.iter()
.map(|t| t.original.chars().count())
.sum()
} }
pub fn get_original_text(self) -> String { pub fn get_original_text(self) -> String {
self.tokens().iter().map(|t| t.original.clone()).collect() self.tokens().iter().map(|t| t.original()).collect()
}
/// Extends the operation with another operation if returning the new operation.
/// Only operations of the same type can be used to extend. If the operations are of different
/// types, returns None.
pub fn extend(self, other: RawOperation<T>) -> Option<RawOperation<T>> {
match (self, other) {
(RawOperation::Insert(tokens1), RawOperation::Insert(tokens2)) => Some(
RawOperation::Insert(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
),
(RawOperation::Delete(tokens1), RawOperation::Delete(tokens2)) => Some(
RawOperation::Delete(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
),
(RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => Some(
RawOperation::Equal(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
),
_ => None,
}
} }
} }

View file

@ -4,5 +4,8 @@ mod operation_transformation;
mod tokenizer; mod tokenizer;
mod utils; mod utils;
pub use errors::SyncLibError;
pub use operation_transformation::reconcile; pub use operation_transformation::reconcile;
pub use operation_transformation::reconcile_with_tokenizer;
pub use operation_transformation::EditedText; pub use operation_transformation::EditedText;
pub use tokenizer::token::Token;

View file

@ -4,8 +4,9 @@ mod operation;
pub use edited_text::EditedText; pub use edited_text::EditedText;
pub use operation::Operation; pub use operation::Operation;
use std::hash::Hash;
use crate::errors::SyncLibError; use crate::{errors::SyncLibError, tokenizer::token::Token};
pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, SyncLibError> { pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, SyncLibError> {
let left_operations = EditedText::from_strings(original, left); let left_operations = EditedText::from_strings(original, left);
@ -15,6 +16,23 @@ pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, Sync
merged_operations.apply() merged_operations.apply()
} }
/// Reconciles two edited versions of `original` using a caller-supplied tokenizer.
///
/// An `EditedText` is built for `left` and for `right`, each relative to
/// `original` and each using `tokenizer` to split the text. The two are then
/// merged and the merged result is applied.
///
/// `tokenizer` maps a string slice to a list of `Token<T>`.
///
/// Returns whatever `apply()` on the merged `EditedText` returns: the resulting
/// text on success, or a `SyncLibError`.
pub fn reconcile_with_tokenizer<F, T>(
    original: &str,
    left: &str,
    right: &str,
    tokenizer: &F,
) -> Result<String, SyncLibError>
where
    F: Fn(&str) -> Vec<Token<T>>,
    T: PartialEq + Hash + Clone,
{
    // Express each side as edits relative to the shared original text.
    let edits_from_left = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
    let edits_from_right = EditedText::from_strings_with_tokenizer(original, right, tokenizer);

    // Combine both sides, then produce the final text.
    let combined = edits_from_left.merge(edits_from_right);
    combined.apply()
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use std::{fs, ops::Range, path::Path}; use std::{fs, ops::Range, path::Path};

View file

@ -1 +1,2 @@
pub mod token; pub mod token;
pub mod word_tokenizer;

View file

@ -1,26 +1,52 @@
use std::hash::Hash;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct Token { pub struct Token<T>
pub normalised: String, where
pub original: String, T: PartialEq + Hash + Clone,
{
normalised: T,
original: String,
} }
impl Token { impl<T> Token<T>
pub fn new(normalised: String, original: String) -> Self { where
T: PartialEq + Hash + Clone,
{
pub fn new(normalised: T, original: String) -> Self {
Token { Token {
normalised, normalised,
original, original,
} }
} }
pub fn tokenize(text: &str) -> Vec<Token> { pub fn original(&self) -> &str {
text.split_inclusive(|c: char| c.is_whitespace()) &self.original
.map(|s| Token::new(s.to_string(), s.to_string())) }
.collect()
pub fn normalised(&self) -> &T {
&self.normalised
}
pub fn get_original_length(&self) -> usize {
self.original.chars().count()
} }
} }
impl PartialEq for Token { impl<T> PartialEq for Token<T>
where
T: PartialEq + Hash + Clone,
{
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.normalised == other.normalised self.normalised == other.normalised
} }
} }
/// Hashes a token by its normalised value only.
///
/// `original` is left out of the hash, matching `PartialEq for Token`, which
/// compares `normalised` alone. Tokens that compare equal therefore hash equally.
impl<T> Hash for Token<T>
where
    T: PartialEq + Hash + Clone,
{
    fn hash<H>(&self, hasher: &mut H)
    where
        H: std::hash::Hasher,
    {
        Hash::hash(&self.normalised, hasher);
    }
}

View file

@ -0,0 +1,7 @@
use super::token::Token;
/// Splits `text` into tokens, cutting after every whitespace character.
///
/// Each piece keeps the whitespace character that terminated it, so the pieces
/// taken in order cover `text` exactly. A piece is used unchanged as both the
/// normalised value and the original text of its token.
pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
    text.split_inclusive(|ch: char| ch.is_whitespace())
        .map(|piece| {
            let owned = String::from(piece);
            Token::new(owned.clone(), owned)
        })
        .collect()
}