Dedupe inserts

This commit is contained in:
Andras Schmelczer 2025-03-02 14:54:57 +00:00
parent a93c17711c
commit d7ae0a781d
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
5 changed files with 145 additions and 93 deletions

View file

@ -37,7 +37,7 @@ pub fn reconcile_with_tokenizer<F, T>(
tokenizer: &Tokenizer<T>,
) -> String
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
let left_operations = EditedText::from_strings_with_tokenizer(original, left, tokenizer);
let right_operations = EditedText::from_strings_with_tokenizer(original, right, tokenizer);
@ -120,9 +120,6 @@ mod test {
"hi, my friend!",
);
// test_merge_both_ways("hello world", "world !", "hi hello world", "hi world
// !");
test_merge_both_ways(
"both delete the same word",
"both the same word",
@ -147,7 +144,25 @@ mod test {
);
}
#[ignore = "it's too slow"]
#[test]
fn test_reconcile_idempotent_inserts() {
// Both inserted the same prefix; this should get deduped
test_merge_both_ways(
"hi ",
"hi there ",
"hi there my friend",
"hi there my friend",
);
// The prefix of the 2nd appears on the 1st so it shouldn't get duplicated
test_merge_both_ways(
"hi ",
"hi there you ",
"hi there my friend",
"hi there you my friend",
);
}
#[test_matrix( [
"pride_and_prejudice.txt",
"romeo_and_juliet.txt",

View file

@ -2,14 +2,18 @@ use core::{
fmt::{Debug, Display},
ops::Range,
};
use std::cmp::min;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use super::merge_context::MergeContext;
use crate::{
utils::{
find_longest_prefix_contained_within::find_longest_prefix_contained_within,
string_builder::StringBuilder,
},
Token,
utils::{find_common_overlap::find_common_overlap, string_builder::StringBuilder},
};
/// Represents a change that can be applied to a text document.
@ -19,7 +23,7 @@ use crate::{
#[derive(Clone, PartialEq)]
pub enum Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
Insert {
index: usize,
@ -37,7 +41,7 @@ where
impl<T> Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
/// Creates an insert operation with the given index and text.
/// If the text is empty (meaning that the operation would be a no-op),
@ -212,17 +216,20 @@ where
..
}),
) => {
let offset_in_tokens = find_common_overlap(previous_inserted_text, &text);
let trimmed_length_in_tokens = previous_inserted_text.len() - offset_in_tokens;
let trimmed_length = previous_inserted_text
// In case the current insert's prefix appears in the previously inserted text,
// we can trim the current insert to only include the non-overlapping part.
// This way, we don't end up duplicating text.
let offset_in_tokens =
find_longest_prefix_contained_within(previous_inserted_text, &text);
let offset_in_length = text
.iter()
.skip(offset_in_tokens)
.take(offset_in_tokens)
.map(Token::get_original_length)
.sum::<usize>();
let trimmed_operation =
Operation::create_insert(index, text[trimmed_length_in_tokens..].to_vec());
Operation::create_insert(index, text[offset_in_tokens..].to_vec());
affecting_context.shift -= trimmed_length as i64;
affecting_context.shift -= offset_in_length as i64;
produced_context.shift += trimmed_operation
.as_ref()
.map(Operation::len)
@ -297,7 +304,7 @@ where
impl<T> Display for Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
@ -341,7 +348,7 @@ where
impl<T> Debug for Operation<T>
where
T: PartialEq + Clone,
T: PartialEq + Clone + std::fmt::Debug,
{
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{self}") }
}
@ -355,11 +362,9 @@ mod tests {
#[test]
#[should_panic]
fn test_shifting_error() {
insta::assert_debug_snapshot!(
Operation::create_insert(1, vec!["hi".into()])
.unwrap()
.with_shifted_index(-2)
);
insta::assert_debug_snapshot!(Operation::create_insert(1, vec!["hi".into()])
.unwrap()
.with_shifted_index(-2));
}
#[test]

View file

@ -1,6 +1,6 @@
pub mod common_prefix_len;
pub mod common_suffix_len;
pub mod find_common_overlap;
pub mod find_longest_prefix_contained_within;
pub mod merge_iters;
pub mod ordered_operation;
pub mod side;

View file

@ -1,71 +0,0 @@
use crate::Token;
/// Given two lists of tokens, returns the offset in the first (old) list from
/// which the two lists have the same tokens until the end of the first list.
/// Thus, the suffix of the old list from the offset to the end is equal to a
/// prefix of the new list.
///
/// If there is no overlap, the function returns the maxmium offset, the length
/// of the old list.
///
/// ## Example
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [9, 0, 2, 5, 1]
/// ```
/// > results in an offset of 2
pub fn find_common_overlap<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
where
T: PartialEq + Clone,
{
let minimum_offset = old.len().saturating_sub(new.len());
for offset in minimum_offset..old.len() {
if old.iter().skip(offset).zip(new.iter()).all(|(a, b)| a == b) {
return offset;
}
}
old.len()
}
#[cfg(test)]
mod tests {
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_common_overlap() {
assert_eq!(find_common_overlap(&["".into()], &["".into()]), 0);
assert_eq!(
find_common_overlap(
&["a".into(), "b".into(), "c".into()],
&["b".into(), "c".into(), "a".into()]
),
1
);
assert_eq!(
find_common_overlap(
&["a".into(), "a".into(), "a".into()],
&["a".into(), "b".into(), "c".into()]
),
2
);
assert_eq!(
find_common_overlap(
&["a".into(), "b".into(), "c".into()],
&["d".into(), "e".into(), "a".into()]
),
3
);
assert_eq!(
find_common_overlap(&["a".into(), "a".into()], &["a".into()]),
1
);
}
}

View file

@ -0,0 +1,103 @@
use crate::Token;
/// Given two lists of tokens, returns `length` where `old` list somewhere
/// within contains the `length` prefix of the `new` list.
///
/// ## Example
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [9, 0, 2, 5, 1]
/// ```
/// > results in an length of 4
///
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [0, 2]
/// ```
/// > results in an length of 2
///
/// ```not_rust
/// old: [0, 1, 9, 0, 2, 5]
/// new: [0, 4]
/// ```
/// > results in an length of 1
pub fn find_longest_prefix_contained_within<T>(old: &[Token<T>], new: &[Token<T>]) -> usize
where
T: PartialEq + Clone + std::fmt::Debug,
{
let max_possible = new.len().min(old.len());
for len in (1..=max_possible).rev() {
let prefix = &new[..len];
if old.windows(len).any(|window| window == prefix) {
return len;
}
}
0
}
#[cfg(test)]
mod tests {
use pretty_assertions::assert_eq;
use super::*;
#[test]
fn test_common_overlap() {
assert_eq!(
find_longest_prefix_contained_within(&["".into()], &["".into()]),
1
);
assert_eq!(
find_longest_prefix_contained_within(
&["a".into(), "b".into(), "c".into()],
&["b".into(), "c".into(), "a".into()]
),
2
);
assert_eq!(
find_longest_prefix_contained_within(
&["a".into(), "b".into(), "c".into()],
&["b".into(), "c".into()]
),
2
);
assert_eq!(
find_longest_prefix_contained_within(
&["a".into(), "b".into(), "c".into()],
&["b".into()]
),
1
);
assert_eq!(
find_longest_prefix_contained_within(
&["a".into(), "b".into(), "c".into(), "b".into(), "a".into()],
&["b".into(), "a".into()]
),
2
);
assert_eq!(
find_longest_prefix_contained_within(
&["a".into(), "a".into(), "a".into()],
&["a".into(), "b".into(), "c".into()]
),
1
);
assert_eq!(
find_longest_prefix_contained_within(
&["a".into(), "b".into(), "c".into()],
&["d".into(), "e".into(), "a".into()]
),
0
);
}
}