Dedupe inserts
This commit is contained in:
parent
a93c17711c
commit
d7ae0a781d
5 changed files with 145 additions and 93 deletions
|
|
@ -37,7 +37,7 @@ pub fn reconcile_with_tokenizer<F, T>(
|
|||
tokenizer: &Tokenizer<T>,
|
||||
) -> String
|
||||
where
|
||||
T: PartialEq + Clone,
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
|
||||
let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
|
||||
|
|
@ -120,9 +120,6 @@ mod test {
|
|||
"hi, my friend!",
|
||||
);
|
||||
|
||||
// test_merge_both_ways("hello world", "world !", "hi hello world", "hi world
|
||||
// !");
|
||||
|
||||
test_merge_both_ways(
|
||||
"both delete the same word",
|
||||
"both the same word",
|
||||
|
|
@ -147,7 +144,25 @@ mod test {
|
|||
);
|
||||
}
|
||||
|
||||
#[ignore = "it's too slow"]
|
||||
#[test]
|
||||
fn test_reconcile_idempotent_inserts() {
|
||||
// Both inserted the same prefix; this should get deduped
|
||||
test_merge_both_ways(
|
||||
"hi ",
|
||||
"hi there ",
|
||||
"hi there my friend",
|
||||
"hi there my friend",
|
||||
);
|
||||
|
||||
// The prefix of the 2nd appears on the 1st so it shouldn't get duplicated
|
||||
test_merge_both_ways(
|
||||
"hi ",
|
||||
"hi there you ",
|
||||
"hi there my friend",
|
||||
"hi there you my friend",
|
||||
);
|
||||
}
|
||||
|
||||
#[test_matrix( [
|
||||
"pride_and_prejudice.txt",
|
||||
"romeo_and_juliet.txt",
|
||||
|
|
|
|||
|
|
@ -2,14 +2,18 @@ use core::{
|
|||
fmt::{Debug, Display},
|
||||
ops::Range,
|
||||
};
|
||||
use std::cmp::min;
|
||||
|
||||
#[cfg(feature = "serde")]
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::merge_context::MergeContext;
|
||||
use crate::{
|
||||
utils::{
|
||||
find_longest_prefix_contained_within::find_longest_prefix_contained_within,
|
||||
string_builder::StringBuilder,
|
||||
},
|
||||
Token,
|
||||
utils::{find_common_overlap::find_common_overlap, string_builder::StringBuilder},
|
||||
};
|
||||
|
||||
/// Represents a change that can be applied to a text document.
|
||||
|
|
@ -19,7 +23,7 @@ use crate::{
|
|||
#[derive(Clone, PartialEq)]
|
||||
pub enum Operation<T>
|
||||
where
|
||||
T: PartialEq + Clone,
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
Insert {
|
||||
index: usize,
|
||||
|
|
@ -37,7 +41,7 @@ where
|
|||
|
||||
impl<T> Operation<T>
|
||||
where
|
||||
T: PartialEq + Clone,
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
/// Creates an insert operation with the given index and text.
|
||||
/// If the text is empty (meaning that the operation would be a no-op),
|
||||
|
|
@ -212,17 +216,20 @@ where
|
|||
..
|
||||
}),
|
||||
) => {
|
||||
let offset_in_tokens = find_common_overlap(previous_inserted_text, &text);
|
||||
let trimmed_length_in_tokens = previous_inserted_text.len() - offset_in_tokens;
|
||||
let trimmed_length = previous_inserted_text
|
||||
// In case the current insert's prefix appears in the previously inserted text,
|
||||
// we can trim the current insert to only include the non-overlapping part.
|
||||
// This way, we don't end up duplicating text.
|
||||
let offset_in_tokens =
|
||||
find_longest_prefix_contained_within(previous_inserted_text, &text);
|
||||
let offset_in_length = text
|
||||
.iter()
|
||||
.skip(offset_in_tokens)
|
||||
.take(offset_in_tokens)
|
||||
.map(Token::get_original_length)
|
||||
.sum::<usize>();
|
||||
let trimmed_operation =
|
||||
Operation::create_insert(index, text[trimmed_length_in_tokens..].to_vec());
|
||||
Operation::create_insert(index, text[offset_in_tokens..].to_vec());
|
||||
|
||||
affecting_context.shift -= trimmed_length as i64;
|
||||
affecting_context.shift -= offset_in_length as i64;
|
||||
produced_context.shift += trimmed_operation
|
||||
.as_ref()
|
||||
.map(Operation::len)
|
||||
|
|
@ -297,7 +304,7 @@ where
|
|||
|
||||
impl<T> Display for Operation<T>
|
||||
where
|
||||
T: PartialEq + Clone,
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match self {
|
||||
|
|
@ -341,7 +348,7 @@ where
|
|||
|
||||
impl<T> Debug for Operation<T>
|
||||
where
|
||||
T: PartialEq + Clone,
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{self}") }
|
||||
}
|
||||
|
|
@ -355,11 +362,9 @@ mod tests {
|
|||
#[test]
|
||||
#[should_panic]
|
||||
fn test_shifting_error() {
|
||||
insta::assert_debug_snapshot!(
|
||||
Operation::create_insert(1, vec!["hi".into()])
|
||||
insta::assert_debug_snapshot!(Operation::create_insert(1, vec!["hi".into()])
|
||||
.unwrap()
|
||||
.with_shifted_index(-2)
|
||||
);
|
||||
.with_shifted_index(-2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
pub mod common_prefix_len;
|
||||
pub mod common_suffix_len;
|
||||
pub mod find_common_overlap;
|
||||
pub mod find_longest_prefix_contained_within;
|
||||
pub mod merge_iters;
|
||||
pub mod ordered_operation;
|
||||
pub mod side;
|
||||
|
|
|
|||
|
|
@ -1,71 +0,0 @@
|
|||
use crate::Token;
|
||||
|
||||
/// Given two lists of tokens, returns the offset in the first (old) list from
|
||||
/// which the two lists have the same tokens until the end of the first list.
|
||||
/// Thus, the suffix of the old list from the offset to the end is equal to a
|
||||
/// prefix of the new list.
|
||||
///
|
||||
/// If there is no overlap, the function returns the maxmium offset, the length
|
||||
/// of the old list.
|
||||
///
|
||||
/// ## Example
|
||||
///
|
||||
/// ```not_rust
|
||||
/// old: [0, 1, 9, 0, 2, 5]
|
||||
/// new: [9, 0, 2, 5, 1]
|
||||
/// ```
|
||||
/// > results in an offset of 2
|
||||
pub fn find_common_overlap<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
|
||||
where
|
||||
T: PartialEq + Clone,
|
||||
{
|
||||
let minimum_offset = old.len().saturating_sub(new.len());
|
||||
for offset in minimum_offset..old.len() {
|
||||
if old.iter().skip(offset).zip(new.iter()).all(|(a, b)| a == b) {
|
||||
return offset;
|
||||
}
|
||||
}
|
||||
|
||||
old.len()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_common_overlap() {
|
||||
assert_eq!(find_common_overlap(&["".into()], &["".into()]), 0);
|
||||
|
||||
assert_eq!(
|
||||
find_common_overlap(
|
||||
&["a".into(), "b".into(), "c".into()],
|
||||
&["b".into(), "c".into(), "a".into()]
|
||||
),
|
||||
1
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_common_overlap(
|
||||
&["a".into(), "a".into(), "a".into()],
|
||||
&["a".into(), "b".into(), "c".into()]
|
||||
),
|
||||
2
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_common_overlap(
|
||||
&["a".into(), "b".into(), "c".into()],
|
||||
&["d".into(), "e".into(), "a".into()]
|
||||
),
|
||||
3
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_common_overlap(&["a".into(), "a".into()], &["a".into()]),
|
||||
1
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
use crate::Token;
|
||||
|
||||
/// Given two lists of tokens, returns `length` where `old` list somewhere
|
||||
/// within contains the `length` prefix of the `new` list.
|
||||
///
|
||||
/// ## Example
|
||||
///
|
||||
/// ```not_rust
|
||||
/// old: [0, 1, 9, 0, 2, 5]
|
||||
/// new: [9, 0, 2, 5, 1]
|
||||
/// ```
|
||||
/// > results in an length of 4
|
||||
///
|
||||
///
|
||||
/// ```not_rust
|
||||
/// old: [0, 1, 9, 0, 2, 5]
|
||||
/// new: [0, 2]
|
||||
/// ```
|
||||
/// > results in an length of 2
|
||||
///
|
||||
/// ```not_rust
|
||||
/// old: [0, 1, 9, 0, 2, 5]
|
||||
/// new: [0, 4]
|
||||
/// ```
|
||||
/// > results in an length of 1
|
||||
pub fn find_longest_prefix_contained_within<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
|
||||
where
|
||||
T: PartialEq + Clone + std::fmt::Debug,
|
||||
{
|
||||
let max_possible = new.len().min(old.len());
|
||||
|
||||
for len in (1..=max_possible).rev() {
|
||||
let prefix = &new[..len];
|
||||
if old.windows(len).any(|window| window == prefix) {
|
||||
return len;
|
||||
}
|
||||
}
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_common_overlap() {
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(&["".into()], &["".into()]),
|
||||
1
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(
|
||||
&["a".into(), "b".into(), "c".into()],
|
||||
&["b".into(), "c".into(), "a".into()]
|
||||
),
|
||||
2
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(
|
||||
&["a".into(), "b".into(), "c".into()],
|
||||
&["b".into(), "c".into()]
|
||||
),
|
||||
2
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(
|
||||
&["a".into(), "b".into(), "c".into()],
|
||||
&["b".into()]
|
||||
),
|
||||
1
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(
|
||||
&["a".into(), "b".into(), "c".into(), "b".into(), "a".into()],
|
||||
&["b".into(), "a".into()]
|
||||
),
|
||||
2
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(
|
||||
&["a".into(), "a".into(), "a".into()],
|
||||
&["a".into(), "b".into(), "c".into()]
|
||||
),
|
||||
1
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
find_longest_prefix_contained_within(
|
||||
&["a".into(), "b".into(), "c".into()],
|
||||
&["d".into(), "e".into(), "a".into()]
|
||||
),
|
||||
0
|
||||
);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue