Merge crates
This commit is contained in:
parent
82e77eec89
commit
bcbac03228
60 changed files with 73 additions and 248 deletions
357
src/diffs/myers.rs
Normal file
357
src/diffs/myers.rs
Normal file
|
|
@ -0,0 +1,357 @@
|
|||
//! Taken from <https://github.com/mitsuhiko/similar/blob/7e15c44de11a1cd61e1149189929e189ef977fd8/src/algorithms/myers.rs>
|
||||
//!
|
||||
//! Myers' diff algorithm.
|
||||
//!
|
||||
//! * time: `O((N+M)D)`
|
||||
//! * space `O(N+M)`
|
||||
//!
|
||||
//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
|
||||
//! describing it.
|
||||
//!
|
||||
//! The implementation of this algorithm is based on the implementation by
|
||||
//! Brandon Williams.
|
||||
//!
|
||||
//! # Heuristics
|
||||
//!
|
||||
//! At present this implementation of Myers' does not implement any more
|
||||
//! advanced heuristics that would solve some pathological cases. For instance
|
||||
//! passing two large and completely distinct sequences to the algorithm will
|
||||
//! make it spin without making reasonable progress.
|
||||
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
|
||||
|
||||
use std::{
|
||||
ops::{Index, IndexMut, Range},
|
||||
vec,
|
||||
};
|
||||
|
||||
use super::raw_operation::RawOperation;
|
||||
use crate::{
|
||||
tokenizer::token::Token,
|
||||
utils::{common_prefix_len::common_prefix_len, common_suffix_len::common_suffix_len},
|
||||
};
|
||||
|
||||
/// Myers' diff algorithm.
|
||||
///
|
||||
/// Diff `old`, between indices `old_range` and `new` between indices
|
||||
/// `new_range`.
|
||||
///
|
||||
/// The returned `RawOperations` all have a token count of 1.
|
||||
pub fn diff<T>(old: &[Token<T>], new: &[Token<T>]) -> Vec<RawOperation<T>>
|
||||
where
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
let max_d = (old.len() + new.len()).div_ceil(2) + 1;
|
||||
let mut vb = V::new(max_d);
|
||||
let mut vf = V::new(max_d);
|
||||
let mut result: Vec<RawOperation<T>> = vec![];
|
||||
|
||||
conquer(
|
||||
old,
|
||||
0..old.len(),
|
||||
new,
|
||||
0..new.len(),
|
||||
&mut vf,
|
||||
&mut vb,
|
||||
&mut result,
|
||||
);
|
||||
|
||||
debug_assert!(
|
||||
result.iter().all(|op| op.tokens().len() == 1),
|
||||
"All operations should be of length 1"
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
|
||||
// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
|
||||
// and then a possibly empty sequence of diagonal edges called a snake.
|
||||
|
||||
/// `V` contains the endpoints of the furthest reaching `D-paths`. For each
|
||||
/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x`
|
||||
/// because `y` can be computed from `x - k`. In other words, `V` is an array of
|
||||
/// integers where `V[k]` contains the row index of the endpoint of the furthest
|
||||
/// reaching path in diagonal `k`.
|
||||
///
|
||||
/// We can't use a traditional Vec to represent `V` since we use `k` as an index
|
||||
/// and it can take on negative values. So instead `V` is represented as a
|
||||
/// light-weight wrapper around a Vec plus an `offset` which is the maximum
|
||||
/// value `k` can take on in order to map negative `k`'s back to a value >= 0.
|
||||
#[derive(Debug)]
|
||||
struct V {
|
||||
offset: isize,
|
||||
v: Vec<usize>, // Look into initializing this to -1 and storing isize
|
||||
}
|
||||
|
||||
impl V {
|
||||
fn new(max_d: usize) -> Self {
|
||||
Self {
|
||||
offset: max_d as isize,
|
||||
v: vec![0; 2 * max_d],
|
||||
}
|
||||
}
|
||||
|
||||
fn len(&self) -> usize { self.v.len() }
|
||||
}
|
||||
|
||||
impl Index<isize> for V {
|
||||
type Output = usize;
|
||||
|
||||
fn index(&self, index: isize) -> &Self::Output { &self.v[(index + self.offset) as usize] }
|
||||
}
|
||||
|
||||
impl IndexMut<isize> for V {
|
||||
fn index_mut(&mut self, index: isize) -> &mut Self::Output {
|
||||
&mut self.v[(index + self.offset) as usize]
|
||||
}
|
||||
}
|
||||
|
||||
fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
|
||||
(range.start..at, at..range.end)
|
||||
}
|
||||
|
||||
/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
|
||||
/// a snake has a start end end point (and it is possible for a snake to have
|
||||
/// a length of zero, meaning the start and end points are the same) however
|
||||
/// we do not need the end point which is why it's not implemented here.
|
||||
///
|
||||
/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
|
||||
/// some of which may be empty. The divide step requires finding the ceil(D/2) +
|
||||
/// 1 or middle snake of an optimal D-path. The idea for doing so is to
|
||||
/// simultaneously run the basic algorithm in both the forward and reverse
|
||||
/// directions until furthest reaching forward and reverse paths starting at
|
||||
/// opposing corners 'overlap'.
|
||||
fn find_middle_snake<T>(
|
||||
old: &[Token<T>],
|
||||
old_range: Range<usize>,
|
||||
new: &[Token<T>],
|
||||
new_range: Range<usize>,
|
||||
vf: &mut V,
|
||||
vb: &mut V,
|
||||
) -> Option<(usize, usize)>
|
||||
where
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
let n = old_range.len();
|
||||
let m = new_range.len();
|
||||
|
||||
// By Lemma 1 in the paper, the optimal edit script length is odd or even as
|
||||
// `delta` is odd or even.
|
||||
let delta = n as isize - m as isize;
|
||||
let odd = delta & 1 == 1;
|
||||
|
||||
// The initial point at (0, -1)
|
||||
vf[1] = 0;
|
||||
// The initial point at (N, M+1)
|
||||
vb[1] = 0;
|
||||
|
||||
let d_max = (n + m).div_ceil(2) + 1;
|
||||
assert!(vf.len() >= d_max);
|
||||
assert!(vb.len() >= d_max);
|
||||
|
||||
for d in 0..d_max as isize {
|
||||
// Forward path
|
||||
for k in (-d..=d).rev().step_by(2) {
|
||||
let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
|
||||
vf[k + 1]
|
||||
} else {
|
||||
vf[k - 1] + 1
|
||||
};
|
||||
let y = (x as isize - k) as usize;
|
||||
|
||||
// The coordinate of the start of a snake
|
||||
let (x0, y0) = (x, y);
|
||||
// While these sequences are identical, keep moving through the
|
||||
// graph with no cost
|
||||
if x < old_range.len() && y < new_range.len() {
|
||||
let advance = common_prefix_len(
|
||||
old,
|
||||
old_range.start + x..old_range.end,
|
||||
new,
|
||||
new_range.start + y..new_range.end,
|
||||
);
|
||||
x += advance;
|
||||
}
|
||||
|
||||
// This is the new best x value
|
||||
vf[k] = x;
|
||||
|
||||
// Only check for connections from the forward search when N - M is
|
||||
// odd and when there is a reciprocal k line coming from the other
|
||||
// direction.
|
||||
if odd && (k - delta).abs() <= (d - 1) {
|
||||
// TODO optimize this so we don't have to compare against n
|
||||
if vf[k] + vb[-(k - delta)] >= n {
|
||||
// Return the snake
|
||||
return Some((x0 + old_range.start, y0 + new_range.start));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Backward path
|
||||
for k in (-d..=d).rev().step_by(2) {
|
||||
let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
|
||||
vb[k + 1]
|
||||
} else {
|
||||
vb[k - 1] + 1
|
||||
};
|
||||
let mut y = (x as isize - k) as usize;
|
||||
|
||||
// The coordinate of the start of a snake
|
||||
if x < n && y < m {
|
||||
let advance = common_suffix_len(
|
||||
old,
|
||||
old_range.start..old_range.start + n - x,
|
||||
new,
|
||||
new_range.start..new_range.start + m - y,
|
||||
);
|
||||
x += advance;
|
||||
y += advance;
|
||||
}
|
||||
|
||||
// This is the new best x value
|
||||
vb[k] = x;
|
||||
|
||||
if !odd && (k - delta).abs() <= d {
|
||||
// TODO optimize this so we don't have to compare against n
|
||||
if vb[k] + vf[-(k - delta)] >= n {
|
||||
// Return the snake
|
||||
return Some((n - x + old_range.start, m - y + new_range.start));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Maybe there's an opportunity to optimize and bail early?
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn conquer<T>(
|
||||
old: &[Token<T>],
|
||||
mut old_range: Range<usize>,
|
||||
new: &[Token<T>],
|
||||
mut new_range: Range<usize>,
|
||||
vf: &mut V,
|
||||
vb: &mut V,
|
||||
result: &mut Vec<RawOperation<T>>,
|
||||
) where
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
// Check for common prefix
|
||||
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
|
||||
if common_prefix_len > 0 {
|
||||
result.extend(
|
||||
old[old_range.start..old_range.start + common_prefix_len]
|
||||
.iter()
|
||||
.map(|token| RawOperation::Equal(vec![token.clone()])),
|
||||
);
|
||||
}
|
||||
old_range.start += common_prefix_len;
|
||||
new_range.start += common_prefix_len;
|
||||
|
||||
// Check for common suffix
|
||||
let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
|
||||
let common_suffix = (
|
||||
old_range.end - common_suffix_len,
|
||||
new_range.end - common_suffix_len,
|
||||
);
|
||||
old_range.end -= common_suffix_len;
|
||||
new_range.end -= common_suffix_len;
|
||||
|
||||
if old_range.is_empty() && new_range.is_empty() {
|
||||
// do nothing
|
||||
} else if new_range.is_empty() {
|
||||
result.extend(
|
||||
old[old_range.start..old_range.start + old_range.len()]
|
||||
.iter()
|
||||
.map(|token| RawOperation::Delete(vec![token.clone()])),
|
||||
);
|
||||
} else if old_range.is_empty() {
|
||||
result.extend(
|
||||
new[new_range.start..new_range.start + new_range.len()]
|
||||
.iter()
|
||||
.map(|token| RawOperation::Insert(vec![token.clone()])),
|
||||
);
|
||||
} else if let Some((x_start, y_start)) =
|
||||
find_middle_snake(old, old_range.clone(), new, new_range.clone(), vf, vb)
|
||||
{
|
||||
let (old_a, old_b) = split_at(old_range, x_start);
|
||||
let (new_a, new_b) = split_at(new_range, y_start);
|
||||
conquer(old, old_a, new, new_a, vf, vb, result);
|
||||
conquer(old, old_b, new, new_b, vf, vb, result);
|
||||
} else {
|
||||
result.extend(
|
||||
old[old_range.start..old_range.end]
|
||||
.iter()
|
||||
.map(|token| RawOperation::Delete(vec![token.clone()])),
|
||||
);
|
||||
result.extend(
|
||||
new[new_range.start..new_range.end]
|
||||
.iter()
|
||||
.map(|token| RawOperation::Insert(vec![token.clone()])),
|
||||
);
|
||||
}
|
||||
|
||||
if common_suffix_len > 0 {
|
||||
result.extend(
|
||||
old[common_suffix.0..common_suffix.0 + common_suffix_len]
|
||||
.iter()
|
||||
.map(|token| RawOperation::Equal(vec![token.clone()])),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use insta::assert_debug_snapshot;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_empty_diff() {
|
||||
let old: Vec<Token<String>> = vec![];
|
||||
let new: Vec<Token<String>> = vec![];
|
||||
let result = diff(&old, &new);
|
||||
assert_eq!(result.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_identical_content() {
|
||||
let content = vec!["a".into(), "b".into(), "c".into()];
|
||||
let result = diff(&content, &content);
|
||||
assert_debug_snapshot!(result);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_insert_only() {
|
||||
let old: Vec<Token<String>> = vec![];
|
||||
let new: Vec<Token<String>> = vec!["a".into(), "b".into()];
|
||||
let result = diff(&old, &new);
|
||||
assert_debug_snapshot!(result);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_only() {
|
||||
let old = vec!["a".into(), "b".into()];
|
||||
let new: Vec<Token<String>> = vec![];
|
||||
let result = diff(&old, &new);
|
||||
assert_debug_snapshot!(result);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prefix_and_suffix() {
|
||||
let old = vec!["a".into(), "b".into(), "c".into(), "d".into()];
|
||||
let new = vec!["a".into(), "x".into(), "d".into()];
|
||||
let result = diff(&old, &new);
|
||||
assert_debug_snapshot!(result);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_complex_diff() {
|
||||
let old = vec!["a".into(), "b".into(), "c".into(), "d".into()];
|
||||
let new = vec!["a".into(), "x".into(), "c".into(), "y".into()];
|
||||
let result = diff(&old, &new);
|
||||
assert_debug_snapshot!(result);
|
||||
}
|
||||
}
|
||||
64
src/diffs/raw_operation.rs
Normal file
64
src/diffs/raw_operation.rs
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
use crate::tokenizer::token::Token;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum RawOperation<T>
|
||||
where
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
Insert(Vec<Token<T>>),
|
||||
Delete(Vec<Token<T>>),
|
||||
Equal(Vec<Token<T>>),
|
||||
}
|
||||
|
||||
impl<T> RawOperation<T>
|
||||
where
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
pub fn tokens(&self) -> &Vec<Token<T>> {
|
||||
match self {
|
||||
RawOperation::Insert(tokens)
|
||||
| RawOperation::Delete(tokens)
|
||||
| RawOperation::Equal(tokens) => tokens,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn original_text_length(&self) -> usize {
|
||||
self.tokens().iter().map(Token::get_original_length).sum()
|
||||
}
|
||||
|
||||
pub fn get_original_text(self) -> String { self.tokens().iter().map(Token::original).collect() }
|
||||
|
||||
pub fn is_left_joinable(&self) -> bool {
|
||||
let first_token = self.tokens().first();
|
||||
first_token.is_none_or(|token| token.is_left_joinable)
|
||||
}
|
||||
|
||||
pub fn is_right_joinable(&self) -> bool {
|
||||
let last_token = self.tokens().last();
|
||||
last_token.is_none_or(|token| token.is_right_joinable)
|
||||
}
|
||||
|
||||
/// Extends the operation with another operation. Only operations of the
|
||||
/// same type as self can be used to extend self, otherwise the function
|
||||
/// will panic.
|
||||
pub fn extend(self, other: RawOperation<T>) -> RawOperation<T> {
|
||||
debug_assert!(
|
||||
std::mem::discriminant(&self) == std::mem::discriminant(&other),
|
||||
"Cannot extend operations of different types. This should have been handled before \
|
||||
calling this function."
|
||||
);
|
||||
|
||||
match (self, other) {
|
||||
(RawOperation::Insert(self_tokens), RawOperation::Insert(other_tokens)) => {
|
||||
RawOperation::Insert(self_tokens.into_iter().chain(other_tokens).collect())
|
||||
}
|
||||
(RawOperation::Delete(tokens1), RawOperation::Delete(tokens2)) => {
|
||||
RawOperation::Delete(tokens1.into_iter().chain(tokens2).collect())
|
||||
}
|
||||
(RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => {
|
||||
RawOperation::Equal(tokens1.into_iter().chain(tokens2).collect())
|
||||
}
|
||||
_ => unreachable!("Only operations of the same type can be extended"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
---
|
||||
source: reconcile/src/diffs/myers.rs
|
||||
expression: result
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "a",
|
||||
original: "a",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Insert(
|
||||
[
|
||||
Token {
|
||||
normalised: "x",
|
||||
original: "x",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Delete(
|
||||
[
|
||||
Token {
|
||||
normalised: "b",
|
||||
original: "b",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "c",
|
||||
original: "c",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Insert(
|
||||
[
|
||||
Token {
|
||||
normalised: "y",
|
||||
original: "y",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Delete(
|
||||
[
|
||||
Token {
|
||||
normalised: "d",
|
||||
original: "d",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
]
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
---
|
||||
source: reconcile/src/diffs/myers.rs
|
||||
expression: result
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Delete(
|
||||
[
|
||||
Token {
|
||||
normalised: "a",
|
||||
original: "a",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Delete(
|
||||
[
|
||||
Token {
|
||||
normalised: "b",
|
||||
original: "b",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
]
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
---
|
||||
source: reconcile/src/diffs/myers.rs
|
||||
expression: result
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "a",
|
||||
original: "a",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "b",
|
||||
original: "b",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "c",
|
||||
original: "c",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
]
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
---
|
||||
source: reconcile/src/diffs/myers.rs
|
||||
expression: result
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Insert(
|
||||
[
|
||||
Token {
|
||||
normalised: "a",
|
||||
original: "a",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Insert(
|
||||
[
|
||||
Token {
|
||||
normalised: "b",
|
||||
original: "b",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
]
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
---
|
||||
source: reconcile/src/diffs/myers.rs
|
||||
expression: result
|
||||
snapshot_kind: text
|
||||
---
|
||||
[
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "a",
|
||||
original: "a",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Delete(
|
||||
[
|
||||
Token {
|
||||
normalised: "b",
|
||||
original: "b",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Delete(
|
||||
[
|
||||
Token {
|
||||
normalised: "c",
|
||||
original: "c",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Insert(
|
||||
[
|
||||
Token {
|
||||
normalised: "x",
|
||||
original: "x",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
Equal(
|
||||
[
|
||||
Token {
|
||||
normalised: "d",
|
||||
original: "d",
|
||||
is_left_joinable: true,
|
||||
is_right_joinable: true,
|
||||
},
|
||||
],
|
||||
),
|
||||
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue