Add tokenizer
This commit is contained in:
parent
e910d9c5f4
commit
331e264399
7 changed files with 119 additions and 33 deletions
|
|
@ -18,6 +18,7 @@
|
||||||
//! without making reasonable progress.
|
//! without making reasonable progress.
|
||||||
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
|
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
|
||||||
|
|
||||||
|
use std::hash::Hash;
|
||||||
use std::ops::{Index, IndexMut, Range};
|
use std::ops::{Index, IndexMut, Range};
|
||||||
use std::vec;
|
use std::vec;
|
||||||
|
|
||||||
|
|
@ -33,11 +34,14 @@ use super::raw_operation::RawOperation;
|
||||||
///
|
///
|
||||||
/// This diff is done with an optional deadline that defines the maximal
|
/// This diff is done with an optional deadline that defines the maximal
|
||||||
/// execution time permitted before it bails and falls back to an approximation.
|
/// execution time permitted before it bails and falls back to an approximation.
|
||||||
pub fn diff(old: &[Token], new: &[Token]) -> Vec<RawOperation> {
|
pub fn diff<T>(old: &[Token<T>], new: &[Token<T>]) -> Vec<RawOperation<T>>
|
||||||
|
where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
let max_d = max_d(old.len(), new.len());
|
let max_d = max_d(old.len(), new.len());
|
||||||
let mut vb = V::new(max_d);
|
let mut vb = V::new(max_d);
|
||||||
let mut vf = V::new(max_d);
|
let mut vf = V::new(max_d);
|
||||||
let mut result: Vec<RawOperation> = vec![];
|
let mut result: Vec<RawOperation<T>> = vec![];
|
||||||
conquer(
|
conquer(
|
||||||
old,
|
old,
|
||||||
0..old.len(),
|
0..old.len(),
|
||||||
|
|
@ -118,14 +122,17 @@ fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
|
||||||
/// simultaneously run the basic algorithm in both the forward and reverse
|
/// simultaneously run the basic algorithm in both the forward and reverse
|
||||||
/// directions until furthest reaching forward and reverse paths starting at
|
/// directions until furthest reaching forward and reverse paths starting at
|
||||||
/// opposing corners 'overlap'.
|
/// opposing corners 'overlap'.
|
||||||
fn find_middle_snake(
|
fn find_middle_snake<T>(
|
||||||
old: &[Token],
|
old: &[Token<T>],
|
||||||
old_range: Range<usize>,
|
old_range: Range<usize>,
|
||||||
new: &[Token],
|
new: &[Token<T>],
|
||||||
new_range: Range<usize>,
|
new_range: Range<usize>,
|
||||||
vf: &mut V,
|
vf: &mut V,
|
||||||
vb: &mut V,
|
vb: &mut V,
|
||||||
) -> Option<(usize, usize)> {
|
) -> Option<(usize, usize)>
|
||||||
|
where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
let n = old_range.len();
|
let n = old_range.len();
|
||||||
let m = new_range.len();
|
let m = new_range.len();
|
||||||
|
|
||||||
|
|
@ -222,15 +229,17 @@ fn find_middle_snake(
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
fn conquer(
|
fn conquer<T>(
|
||||||
old: &[Token],
|
old: &[Token<T>],
|
||||||
mut old_range: Range<usize>,
|
mut old_range: Range<usize>,
|
||||||
new: &[Token],
|
new: &[Token<T>],
|
||||||
mut new_range: Range<usize>,
|
mut new_range: Range<usize>,
|
||||||
vf: &mut V,
|
vf: &mut V,
|
||||||
vb: &mut V,
|
vb: &mut V,
|
||||||
result: &mut Vec<RawOperation>,
|
result: &mut Vec<RawOperation<T>>,
|
||||||
) {
|
) where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
// Check for common prefix
|
// Check for common prefix
|
||||||
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
|
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
|
||||||
if common_prefix_len > 0 {
|
if common_prefix_len > 0 {
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,21 @@
|
||||||
use crate::tokenizer::token::Token;
|
use crate::tokenizer::token::Token;
|
||||||
|
use std::hash::Hash;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq)]
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
pub enum RawOperation {
|
pub enum RawOperation<T>
|
||||||
Insert(Vec<Token>),
|
where
|
||||||
Delete(Vec<Token>),
|
T: PartialEq + Hash + Clone,
|
||||||
Equal(Vec<Token>),
|
{
|
||||||
|
Insert(Vec<Token<T>>),
|
||||||
|
Delete(Vec<Token<T>>),
|
||||||
|
Equal(Vec<Token<T>>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RawOperation {
|
impl<T> RawOperation<T>
|
||||||
pub fn tokens(&self) -> &Vec<Token> {
|
where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
|
pub fn tokens(&self) -> &Vec<Token<T>> {
|
||||||
match self {
|
match self {
|
||||||
RawOperation::Insert(tokens) => tokens,
|
RawOperation::Insert(tokens) => tokens,
|
||||||
RawOperation::Delete(tokens) => tokens,
|
RawOperation::Delete(tokens) => tokens,
|
||||||
|
|
@ -17,13 +24,28 @@ impl RawOperation {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn original_text_length(&self) -> usize {
|
pub fn original_text_length(&self) -> usize {
|
||||||
self.tokens()
|
self.tokens().iter().map(|t| t.get_original_length()).sum()
|
||||||
.iter()
|
|
||||||
.map(|t| t.original.chars().count())
|
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_original_text(self) -> String {
|
pub fn get_original_text(self) -> String {
|
||||||
self.tokens().iter().map(|t| t.original.clone()).collect()
|
self.tokens().iter().map(|t| t.original()).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extends this operation with another operation, returning the combined operation.
|
||||||
|
/// Only operations of the same type can be used to extend. If the operations are of different
|
||||||
|
/// types, returns None.
|
||||||
|
pub fn extend(self, other: RawOperation<T>) -> Option<RawOperation<T>> {
|
||||||
|
match (self, other) {
|
||||||
|
(RawOperation::Insert(tokens1), RawOperation::Insert(tokens2)) => Some(
|
||||||
|
RawOperation::Insert(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
|
||||||
|
),
|
||||||
|
(RawOperation::Delete(tokens1), RawOperation::Delete(tokens2)) => Some(
|
||||||
|
RawOperation::Delete(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
|
||||||
|
),
|
||||||
|
(RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => Some(
|
||||||
|
RawOperation::Equal(tokens1.into_iter().chain(tokens2.into_iter()).collect()),
|
||||||
|
),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,5 +4,8 @@ mod operation_transformation;
|
||||||
mod tokenizer;
|
mod tokenizer;
|
||||||
mod utils;
|
mod utils;
|
||||||
|
|
||||||
|
pub use errors::SyncLibError;
|
||||||
pub use operation_transformation::reconcile;
|
pub use operation_transformation::reconcile;
|
||||||
|
pub use operation_transformation::reconcile_with_tokenizer;
|
||||||
pub use operation_transformation::EditedText;
|
pub use operation_transformation::EditedText;
|
||||||
|
pub use tokenizer::token::Token;
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,9 @@ mod operation;
|
||||||
|
|
||||||
pub use edited_text::EditedText;
|
pub use edited_text::EditedText;
|
||||||
pub use operation::Operation;
|
pub use operation::Operation;
|
||||||
|
use std::hash::Hash;
|
||||||
|
|
||||||
use crate::errors::SyncLibError;
|
use crate::{errors::SyncLibError, tokenizer::token::Token};
|
||||||
|
|
||||||
pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, SyncLibError> {
|
pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, SyncLibError> {
|
||||||
let left_operations = EditedText::from_strings(original, left);
|
let left_operations = EditedText::from_strings(original, left);
|
||||||
|
|
@ -15,6 +16,23 @@ pub fn reconcile(original: &str, left: &str, right: &str) -> Result<String, Sync
|
||||||
merged_operations.apply()
|
merged_operations.apply()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn reconcile_with_tokenizer<F, T>(
|
||||||
|
original: &str,
|
||||||
|
left: &str,
|
||||||
|
right: &str,
|
||||||
|
tokenizer: &F,
|
||||||
|
) -> Result<String, SyncLibError>
|
||||||
|
where
|
||||||
|
F: Fn(&str) -> Vec<Token<T>>,
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
|
let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
|
||||||
|
let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
|
||||||
|
|
||||||
|
let merged_operations = left_operations.merge(right_operations);
|
||||||
|
merged_operations.apply()
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use std::{fs, ops::Range, path::Path};
|
use std::{fs, ops::Range, path::Path};
|
||||||
|
|
|
||||||
|
|
@ -1 +1,2 @@
|
||||||
pub mod token;
|
pub mod token;
|
||||||
|
pub mod word_tokenizer;
|
||||||
|
|
|
||||||
|
|
@ -1,26 +1,52 @@
|
||||||
|
use std::hash::Hash;
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Token {
|
pub struct Token<T>
|
||||||
pub normalised: String,
|
where
|
||||||
pub original: String,
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
|
normalised: T,
|
||||||
|
original: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Token {
|
impl<T> Token<T>
|
||||||
pub fn new(normalised: String, original: String) -> Self {
|
where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
|
pub fn new(normalised: T, original: String) -> Self {
|
||||||
Token {
|
Token {
|
||||||
normalised,
|
normalised,
|
||||||
original,
|
original,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn tokenize(text: &str) -> Vec<Token> {
|
pub fn original(&self) -> &str {
|
||||||
text.split_inclusive(|c: char| c.is_whitespace())
|
&self.original
|
||||||
.map(|s| Token::new(s.to_string(), s.to_string()))
|
}
|
||||||
.collect()
|
|
||||||
|
pub fn normalised(&self) -> &T {
|
||||||
|
&self.normalised
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_original_length(&self) -> usize {
|
||||||
|
self.original.chars().count()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialEq for Token {
|
impl<T> PartialEq for Token<T>
|
||||||
|
where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.normalised == other.normalised
|
self.normalised == other.normalised
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<T> Hash for Token<T>
|
||||||
|
where
|
||||||
|
T: PartialEq + Hash + Clone,
|
||||||
|
{
|
||||||
|
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||||
|
self.normalised.hash(state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
7
backend/reconcile/src/tokenizer/word_tokenizer.rs
Normal file
7
backend/reconcile/src/tokenizer/word_tokenizer.rs
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
use super::token::Token;
|
||||||
|
|
||||||
|
pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
|
||||||
|
text.split_inclusive(char::is_whitespace)
|
||||||
|
.map(|s| Token::new(s.to_string(), s.to_string()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue