From 331e264399851585a34d484bfcc073c36bde8ca4 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer <andras@schmelczer.dev>
Date: Sun, 24 Nov 2024 22:32:06 +0000
Subject: [PATCH] Add tokenizer

---
 backend/reconcile/src/diffs/myers.rs          | 31 ++++++++-----
 backend/reconcile/src/diffs/raw_operation.rs  | 44 +++++++++++++-----
 backend/reconcile/src/lib.rs                  |  3 ++
 .../src/operation_transformation/mod.rs       | 20 +++++++-
 backend/reconcile/src/tokenizer/mod.rs        |  1 +
 backend/reconcile/src/tokenizer/token.rs      | 46 +++++++++++++++----
 .../reconcile/src/tokenizer/word_tokenizer.rs |  7 +++
 7 files changed, 119 insertions(+), 33 deletions(-)
 create mode 100644 backend/reconcile/src/tokenizer/word_tokenizer.rs
diff --git a/backend/reconcile/src/diffs/myers.rs b/backend/reconcile/src/diffs/myers.rs
index 445c4606..b9a6d817 100644
--- a/backend/reconcile/src/diffs/myers.rs
+++ b/backend/reconcile/src/diffs/myers.rs
@@ -18,6 +18,7 @@
 //! without making reasonable progress.
 //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
 
+use std::hash::Hash;
 use std::ops::{Index, IndexMut, Range};
 use std::vec;
 
@@ -33,11 +34,14 @@ use super::raw_operation::RawOperation;
 ///
 /// This diff is done with an optional deadline that defines the maximal
 /// execution time permitted before it bails and falls back to an approximation.
-pub fn diff(old: &[Token], new: &[Token]) -> Vec<RawOperation> {
+pub fn diff<T>(old: &[Token<T>], new: &[Token<T>]) -> Vec<RawOperation<T>>
+where
+    T: PartialEq + Hash + Clone,
+{
     let max_d = max_d(old.len(), new.len());
     let mut vb = V::new(max_d);
     let mut vf = V::new(max_d);
-    let mut result: Vec<RawOperation> = vec![];
+    let mut result: Vec<RawOperation<T>> = vec![];
     conquer(
         old,
         0..old.len(),
@@ -118,14 +122,17 @@ fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
 /// simultaneously run the basic algorithm in both the forward and reverse
 /// directions until furthest reaching forward and reverse paths starting at
 /// opposing corners 'overlap'.
-fn find_middle_snake(
-    old: &[Token],
+fn find_middle_snake<T>(
+    old: &[Token<T>],
     old_range: Range<usize>,
-    new: &[Token],
+    new: &[Token<T>],
     new_range: Range<usize>,
     vf: &mut V,
     vb: &mut V,
-) -> Option<(usize, usize)> {
+) -> Option<(usize, usize)>
+where
+    T: PartialEq + Hash + Clone,
+{
     let n = old_range.len();
     let m = new_range.len();
 
@@ -222,15 +229,17 @@ fn find_middle_snake(
     None
 }
 
-fn conquer(
-    old: &[Token],
+fn conquer<T>(
+    old: &[Token<T>],
     mut old_range: Range<usize>,
-    new: &[Token],
+    new: &[Token<T>],
     mut new_range: Range<usize>,
     vf: &mut V,
     vb: &mut V,
-    result: &mut Vec<RawOperation>,
-) {
+    result: &mut Vec<RawOperation<T>>,
+) where
+    T: PartialEq + Hash + Clone,
+{
     // Check for common prefix
     let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
     if common_prefix_len > 0 {
diff --git a/backend/reconcile/src/diffs/raw_operation.rs b/backend/reconcile/src/diffs/raw_operation.rs
index 030f0c7d..f03b2b2f 100644
--- a/backend/reconcile/src/diffs/raw_operation.rs
+++ b/backend/reconcile/src/diffs/raw_operation.rs
@@ -1,14 +1,21 @@
 use crate::tokenizer::token::Token;
+use std::hash::Hash;
 
 #[derive(Debug, Clone, PartialEq)]
-pub enum RawOperation {
-    Insert(Vec<Token>),
-    Delete(Vec<Token>),
-    Equal(Vec<Token>),
+pub enum RawOperation<T>
+where
+    T: PartialEq + Hash + Clone,
+{
+    Insert(Vec<Token<T>>),
+    Delete(Vec<Token<T>>),
+    Equal(Vec<Token<T>>),
 }
 
-impl RawOperation {
-    pub fn tokens(&self) -> &Vec<Token> {
+impl<T> RawOperation<T>
+where
+    T: PartialEq + Hash + Clone,
+{
+    pub fn tokens(&self) -> &Vec<Token<T>> {
         match self {
             RawOperation::Insert(tokens) => tokens,
             RawOperation::Delete(tokens) => tokens,
@@ -17,13 +24,28 @@ impl RawOperation {
     }
 
     pub fn original_text_length(&self) -> usize {
-        self.tokens()
-            .iter()
-            .map(|t| t.original.chars().count())
-            .sum()
+        self.tokens().iter().map(|t| t.get_original_length()).sum()
     }
 
     pub fn get_original_text(self) -> String {
-        self.tokens().iter().map(|t| t.original.clone()).collect()
+        self.tokens().iter().map(|t| t.original()).collect()
+    }
+
+    /// Extends this operation with the tokens of another operation, returning the combined
+    /// operation. Only operations of the same variant can be combined; if the two operations
+    /// are of different variants, returns `None`.
+    pub fn extend(self, other: RawOperation<T>) -> Option<RawOperation<T>> {
+        match (self, other) {
+            (RawOperation::Insert(tokens1), RawOperation::Insert(tokens2)) => Some(
+                RawOperation::Insert(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
+            ),
+            (RawOperation::Delete(tokens1), RawOperation::Delete(tokens2)) => Some(
+                RawOperation::Delete(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
+            ),
+            (RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => Some(
+                RawOperation::Equal(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
+            ),
+            _ => None,
+        }
     }
 }
diff --git a/backend/reconcile/src/lib.rs b/backend/reconcile/src/lib.rs
index 64842a88..7bad0a1f 100644
--- a/backend/reconcile/src/lib.rs
+++ b/backend/reconcile/src/lib.rs
@@ -4,5 +4,8 @@ mod operation_transformation;
 mod tokenizer;
 mod utils;
 
+pub use errors::SyncLibError;
 pub use operation_transformation::reconcile;
+pub use operation_transformation::reconcile_with_tokenizer;
 pub use operation_transformation::EditedText;
+pub use tokenizer::token::Token;
diff --git a/backend/reconcile/src/operation_transformation/mod.rs b/backend/reconcile/src/operation_transformation/mod.rs
index 742905a4..fdf2a32b 100644
--- a/backend/reconcile/src/operation_transformation/mod.rs
+++ b/backend/reconcile/src/operation_transformation/mod.rs
@@ -4,8 +4,9 @@ mod operation;
 
 pub use edited_text::EditedText;
 pub use operation::Operation;
+use std::hash::Hash;
 
-use crate::errors::SyncLibError;
+use crate::{errors::SyncLibError, tokenizer::token::Token};
 
 pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, SyncLibError> {
     let left_operations = EditedText::from_strings(original, left);
@@ -15,6 +16,23 @@ pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, Sync
     merged_operations.apply()
 }
 
+pub fn reconcile_with_tokenizer<F, T>(
+    original: &str,
+    left: &str,
+    right: &str,
+    tokenizer: &F,
+) -> Result<String, SyncLibError>
+where
+    F: Fn(&str) -> Vec<Token<T>>,
+    T: PartialEq + Hash + Clone,
+{
+    let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
+    let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
+
+    let merged_operations = left_operations.merge(right_operations);
+    merged_operations.apply()
+}
+
 #[cfg(test)]
 mod test {
     use std::{fs, ops::Range, path::Path};
diff --git a/backend/reconcile/src/tokenizer/mod.rs b/backend/reconcile/src/tokenizer/mod.rs
index 79c66ba6..6a3b8b41 100644
--- a/backend/reconcile/src/tokenizer/mod.rs
+++ b/backend/reconcile/src/tokenizer/mod.rs
@@ -1 +1,2 @@
 pub mod token;
+pub mod word_tokenizer;
diff --git a/backend/reconcile/src/tokenizer/token.rs b/backend/reconcile/src/tokenizer/token.rs
index 13cf4c8c..1c998cec 100644
--- a/backend/reconcile/src/tokenizer/token.rs
+++ b/backend/reconcile/src/tokenizer/token.rs
@@ -1,26 +1,52 @@
+use std::hash::Hash;
+
 #[derive(Debug, Clone)]
-pub struct Token {
-    pub normalised: String,
-    pub original: String,
+pub struct Token<T>
+where
+    T: PartialEq + Hash + Clone,
+{
+    normalised: T,
+    original: String,
 }
 
-impl Token {
-    pub fn new(normalised: String, original: String) -> Self {
+impl<T> Token<T>
+where
+    T: PartialEq + Hash + Clone,
+{
+    pub fn new(normalised: T, original: String) -> Self {
         Token {
             normalised,
             original,
         }
     }
 
-    pub fn tokenize(text: &str) -> Vec<Token> {
-        text.split_inclusive(|c: char| c.is_whitespace())
-            .map(|s| Token::new(s.to_string(), s.to_string()))
-            .collect()
+    pub fn original(&self) -> &str {
+        &self.original
+    }
+
+    pub fn normalised(&self) -> &T {
+        &self.normalised
+    }
+
+    pub fn get_original_length(&self) -> usize {
+        self.original.chars().count()
     }
 }
 
-impl PartialEq for Token {
+impl<T> PartialEq for Token<T>
+where
+    T: PartialEq + Hash + Clone,
+{
     fn eq(&self, other: &Self) -> bool {
         self.normalised == other.normalised
     }
 }
+
+impl<T> Hash for Token<T>
+where
+    T: PartialEq + Hash + Clone,
+{
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.normalised.hash(state);
+    }
+}
diff --git a/backend/reconcile/src/tokenizer/word_tokenizer.rs b/backend/reconcile/src/tokenizer/word_tokenizer.rs
new file mode 100644
index 00000000..1e4ac6d3
--- /dev/null
+++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs
@@ -0,0 +1,7 @@
+use super::token::Token;
+
+pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
+    text.split_inclusive(char::is_whitespace)
+        .map(|s| Token::new(s.to_string(), s.to_string()))
+        .collect()
+}