From 331e264399851585a34d484bfcc073c36bde8ca4 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 24 Nov 2024 22:32:06 +0000 Subject: [PATCH] Add tokenizer --- backend/reconcile/src/diffs/myers.rs | 31 ++++++++----- backend/reconcile/src/diffs/raw_operation.rs | 44 +++++++++++++----- backend/reconcile/src/lib.rs | 3 ++ .../src/operation_transformation/mod.rs | 20 +++++++- backend/reconcile/src/tokenizer/mod.rs | 1 + backend/reconcile/src/tokenizer/token.rs | 46 +++++++++++++++---- .../reconcile/src/tokenizer/word_tokenizer.rs | 7 +++ 7 files changed, 119 insertions(+), 33 deletions(-) create mode 100644 backend/reconcile/src/tokenizer/word_tokenizer.rs diff --git a/backend/reconcile/src/diffs/myers.rs b/backend/reconcile/src/diffs/myers.rs index 445c4606..b9a6d817 100644 --- a/backend/reconcile/src/diffs/myers.rs +++ b/backend/reconcile/src/diffs/myers.rs @@ -18,6 +18,7 @@ //! without making reasonable progress. //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). +use std::hash::Hash; use std::ops::{Index, IndexMut, Range}; use std::vec; @@ -33,11 +34,14 @@ use super::raw_operation::RawOperation; /// /// This diff is done with an optional deadline that defines the maximal /// execution time permitted before it bails and falls back to an approximation. -pub fn diff(old: &[Token], new: &[Token]) -> Vec { +pub fn diff(old: &[Token], new: &[Token]) -> Vec> +where + T: PartialEq + Hash + Clone, +{ let max_d = max_d(old.len(), new.len()); let mut vb = V::new(max_d); let mut vf = V::new(max_d); - let mut result: Vec = vec![]; + let mut result: Vec> = vec![]; conquer( old, 0..old.len(), @@ -118,14 +122,17 @@ fn split_at(range: Range, at: usize) -> (Range, Range) { /// simultaneously run the basic algorithm in both the forward and reverse /// directions until furthest reaching forward and reverse paths starting at /// opposing corners 'overlap'. -fn find_middle_snake( - old: &[Token], +fn find_middle_snake( + old: &[Token], old_range: Range, - new: &[Token], + new: &[Token], new_range: Range, vf: &mut V, vb: &mut V, -) -> Option<(usize, usize)> { +) -> Option<(usize, usize)> +where + T: PartialEq + Hash + Clone, +{ let n = old_range.len(); let m = new_range.len(); @@ -222,15 +229,17 @@ fn find_middle_snake( None } -fn conquer( - old: &[Token], +fn conquer( + old: &[Token], mut old_range: Range, - new: &[Token], + new: &[Token], mut new_range: Range, vf: &mut V, vb: &mut V, - result: &mut Vec, -) { + result: &mut Vec>, +) where + T: PartialEq + Hash + Clone, +{ // Check for common prefix let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); if common_prefix_len > 0 { diff --git a/backend/reconcile/src/diffs/raw_operation.rs b/backend/reconcile/src/diffs/raw_operation.rs index 030f0c7d..f03b2b2f 100644 --- a/backend/reconcile/src/diffs/raw_operation.rs +++ b/backend/reconcile/src/diffs/raw_operation.rs @@ -1,14 +1,21 @@ use crate::tokenizer::token::Token; +use std::hash::Hash; #[derive(Debug, Clone, PartialEq)] -pub enum RawOperation { - Insert(Vec), - Delete(Vec), - Equal(Vec), +pub enum RawOperation +where + T: PartialEq + Hash + Clone, +{ + Insert(Vec>), + Delete(Vec>), + Equal(Vec>), } -impl RawOperation { - pub fn tokens(&self) -> &Vec { +impl RawOperation +where + T: PartialEq + Hash + Clone, +{ + pub fn tokens(&self) -> &Vec> { match self { RawOperation::Insert(tokens) => tokens, RawOperation::Delete(tokens) => tokens, @@ -17,13 +24,28 @@ impl RawOperation { } pub fn original_text_length(&self) -> usize { - self.tokens() - .iter() - .map(|t| t.original.chars().count()) - .sum() + self.tokens().iter().map(|t| t.get_original_length()).sum() } pub fn get_original_text(self) -> String { - self.tokens().iter().map(|t| t.original.clone()).collect() + self.tokens().iter().map(|t| t.original()).collect() + } + + /// Extends the operation with another operation if returning the new operation. + /// Only operations of the same type can be used to extend. If the operations are of different + /// types, returns None. + pub fn extend(self, other: RawOperation) -> Option> { + match (self, other) { + (RawOperation::Insert(tokens1), RawOperation::Insert(tokens2)) => Some( + RawOperation::Insert(tokens1.into_iter().chain(tokens2.into_iter()).collect()), + ), + (RawOperation::Delete(tokens1), RawOperation::Delete(tokens2)) => Some( + RawOperation::Delete(tokens1.into_iter().chain(tokens2.into_iter()).collect()), + ), + (RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => Some( + RawOperation::Equal(tokens1.into_iter().chain(tokens2.into_iter()).collect()), + ), + _ => None, + } } } diff --git a/backend/reconcile/src/lib.rs b/backend/reconcile/src/lib.rs index 64842a88..7bad0a1f 100644 --- a/backend/reconcile/src/lib.rs +++ b/backend/reconcile/src/lib.rs @@ -4,5 +4,8 @@ mod operation_transformation; mod tokenizer; mod utils; +pub use errors::SyncLibError; pub use operation_transformation::reconcile; +pub use operation_transformation::reconcile_with_tokenizer; pub use operation_transformation::EditedText; +pub use tokenizer::token::Token; diff --git a/backend/reconcile/src/operation_transformation/mod.rs b/backend/reconcile/src/operation_transformation/mod.rs index 742905a4..fdf2a32b 100644 --- a/backend/reconcile/src/operation_transformation/mod.rs +++ b/backend/reconcile/src/operation_transformation/mod.rs @@ -4,8 +4,9 @@ mod operation; pub use edited_text::EditedText; pub use operation::Operation; +use std::hash::Hash; -use crate::errors::SyncLibError; +use crate::{errors::SyncLibError, tokenizer::token::Token}; pub fn reconcile(original: &str, left: &str, right: &str) -> Result { let left_operations = EditedText::from_strings(original, left); @@ -15,6 +16,23 @@ pub fn reconcile(original: &str, left: &str, right: &str) -> Result( + original: &str, + left: &str, + right: &str, + tokenizer: &F, +) -> Result +where + F: Fn(&str) -> Vec>, + T: PartialEq + Hash + Clone, +{ + let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer); + let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer); + + let merged_operations = left_operations.merge(right_operations); + merged_operations.apply() +} + #[cfg(test)] mod test { use std::{fs, ops::Range, path::Path}; diff --git a/backend/reconcile/src/tokenizer/mod.rs b/backend/reconcile/src/tokenizer/mod.rs index 79c66ba6..6a3b8b41 100644 --- a/backend/reconcile/src/tokenizer/mod.rs +++ b/backend/reconcile/src/tokenizer/mod.rs @@ -1 +1,2 @@ pub mod token; +pub mod word_tokenizer; diff --git a/backend/reconcile/src/tokenizer/token.rs b/backend/reconcile/src/tokenizer/token.rs index 13cf4c8c..1c998cec 100644 --- a/backend/reconcile/src/tokenizer/token.rs +++ b/backend/reconcile/src/tokenizer/token.rs @@ -1,26 +1,52 @@ +use std::hash::Hash; + #[derive(Debug, Clone)] -pub struct Token { - pub normalised: String, - pub original: String, +pub struct Token +where + T: PartialEq + Hash + Clone, +{ + normalised: T, + original: String, } -impl Token { - pub fn new(normalised: String, original: String) -> Self { +impl Token +where + T: PartialEq + Hash + Clone, +{ + pub fn new(normalised: T, original: String) -> Self { Token { normalised, original, } } - pub fn tokenize(text: &str) -> Vec { - text.split_inclusive(|c: char| c.is_whitespace()) - .map(|s| Token::new(s.to_string(), s.to_string())) - .collect() + pub fn original(&self) -> &str { + &self.original + } + + pub fn normalised(&self) -> &T { + &self.normalised + } + + pub fn get_original_length(&self) -> usize { + self.original.chars().count() } } -impl PartialEq for Token { +impl PartialEq for Token +where + T: PartialEq + Hash + Clone, +{ fn eq(&self, other: &Self) -> bool { self.normalised == other.normalised } } + +impl Hash for Token +where + T: PartialEq + Hash + Clone, +{ + fn hash(&self, state: &mut H) { + self.normalised.hash(state); + } +} diff --git a/backend/reconcile/src/tokenizer/word_tokenizer.rs b/backend/reconcile/src/tokenizer/word_tokenizer.rs new file mode 100644 index 00000000..1e4ac6d3 --- /dev/null +++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs @@ -0,0 +1,7 @@ +use super::token::Token; + +pub fn word_tokenizer(text: &str) -> Vec> { + text.split_inclusive(char::is_whitespace) + .map(|s| Token::new(s.to_string(), s.to_string())) + .collect() +}