Add left/right joinability for tokens

This commit is contained in:
Andras Schmelczer 2025-04-05 13:48:02 +01:00
parent b0c6c082a1
commit b230d34b88
No known key found for this signature in database
GPG key ID: FC8F2C3D3D1A718C
13 changed files with 313 additions and 75 deletions

View file

@ -28,10 +28,26 @@ where
/// Consumes the operation and concatenates the original text of all of its
/// tokens, in order, into a single `String`.
pub fn get_original_text(self) -> String { self.tokens().iter().map(Token::original).collect() }
/// Extends the operation with another operation if returning the new
/// operation. Only operations of the same type can be used to extend.
/// If the operations are of different types, returns None.
/// Reports whether this operation can be joined with the operation before
/// it. The answer is taken from the first token; an operation without any
/// tokens is considered joinable.
pub fn is_left_joinable(&self) -> bool {
    match self.tokens().first() {
        Some(token) => token.get_is_left_joinable(),
        None => true,
    }
}
/// Reports whether this operation can be joined with the operation after
/// it. The answer is taken from the last token; an operation without any
/// tokens is considered joinable.
pub fn is_right_joinable(&self) -> bool {
    match self.tokens().last() {
        Some(token) => token.get_is_right_joinable(),
        None => true,
    }
}
/// Extends this operation with another operation, returning the combined
/// operation in `Some`. Only operations of the same type as self can be used
/// to extend self. If the operations are of different types, returns None.
pub fn extend(self, other: RawOperation<T>) -> Option<RawOperation<T>> {
debug_assert!(
std::mem::discriminant(&self) == std::mem::discriminant(&other),
"Cannot extend operations of different types. This should have been handled before \
calling this function."
);
match (self, other) {
(RawOperation::Insert(tokens1), RawOperation::Insert(tokens2)) => Some(
RawOperation::Insert(tokens1.into_iter().chain(tokens2).collect()),
@ -42,7 +58,7 @@ where
(RawOperation::Equal(tokens1), RawOperation::Equal(tokens2)) => Some(
RawOperation::Equal(tokens1.into_iter().chain(tokens2).collect()),
),
_ => None,
_ => unreachable!("Only operations of the same type can be extended"),
}
}
}

View file

@ -9,6 +9,8 @@ snapshot_kind: text
Token {
normalised: "a",
original: "a",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -17,6 +19,8 @@ snapshot_kind: text
Token {
normalised: "x",
original: "x",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -25,6 +29,8 @@ snapshot_kind: text
Token {
normalised: "b",
original: "b",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -33,6 +39,8 @@ snapshot_kind: text
Token {
normalised: "c",
original: "c",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -41,6 +49,8 @@ snapshot_kind: text
Token {
normalised: "y",
original: "y",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -49,6 +59,8 @@ snapshot_kind: text
Token {
normalised: "d",
original: "d",
is_left_joinable: true,
is_right_joinable: true,
},
],
),

View file

@ -9,10 +9,14 @@ snapshot_kind: text
Token {
normalised: "a",
original: "a",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "b",
original: "b",
is_left_joinable: true,
is_right_joinable: true,
},
],
),

View file

@ -9,14 +9,20 @@ snapshot_kind: text
Token {
normalised: "a",
original: "a",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "b",
original: "b",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "c",
original: "c",
is_left_joinable: true,
is_right_joinable: true,
},
],
),

View file

@ -9,10 +9,14 @@ snapshot_kind: text
Token {
normalised: "a",
original: "a",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "b",
original: "b",
is_left_joinable: true,
is_right_joinable: true,
},
],
),

View file

@ -9,6 +9,8 @@ snapshot_kind: text
Token {
normalised: "a",
original: "a",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -17,10 +19,14 @@ snapshot_kind: text
Token {
normalised: "b",
original: "b",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "c",
original: "c",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -29,6 +35,8 @@ snapshot_kind: text
Token {
normalised: "x",
original: "x",
is_left_joinable: true,
is_right_joinable: true,
},
],
),
@ -37,6 +45,8 @@ snapshot_kind: text
Token {
normalised: "d",
original: "d",
is_left_joinable: true,
is_right_joinable: true,
},
],
),

View file

@ -3,15 +3,12 @@ use core::iter;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use super::{CursorPosition, Operation, TextWithCursors};
use super::{CursorPosition, Operation, TextWithCursors, ordered_operation::OrderedOperation};
use crate::{
diffs::{myers::diff, raw_operation::RawOperation},
operation_transformation::merge_context::MergeContext,
tokenizer::{Tokenizer, word_tokenizer::word_tokenizer},
utils::{
merge_iters::MergeSorted as _, ordered_operation::OrderedOperation, side::Side,
string_builder::StringBuilder,
},
utils::{merge_iters::MergeSorted as _, side::Side, string_builder::StringBuilder},
};
/// A sequence of operations that can be applied to a text document.
@ -66,11 +63,93 @@ where
Self::new(
original,
Self::cook_operations(Self::elongate_operations(diff)).collect(),
Self::cook_operations(Self::elongate_operations(Self::break_up_raw_operations(
diff,
)))
.collect(),
updated.cursors,
)
}
/// Splits every raw operation into single-token operations of the same
/// kind, keeping the tokens in their original order.
///
/// An `Insert` carrying three tokens, for example, becomes three consecutive
/// `Insert`s carrying one token each. An operation without tokens yields
/// nothing.
fn break_up_raw_operations<I>(raw_operations: I) -> impl Iterator<Item = RawOperation<T>>
where
    I: IntoIterator<Item = RawOperation<T>>,
{
    raw_operations.into_iter().flat_map(|operation| {
        // Re-wrap each token in the variant it came from.
        let pieces: Vec<RawOperation<T>> = match operation {
            RawOperation::Insert(tokens) => tokens
                .into_iter()
                .map(|token| RawOperation::Insert(vec![token]))
                .collect(),
            RawOperation::Delete(tokens) => tokens
                .into_iter()
                .map(|token| RawOperation::Delete(vec![token]))
                .collect(),
            RawOperation::Equal(tokens) => tokens
                .into_iter()
                .map(|token| RawOperation::Equal(vec![token]))
                .collect(),
        };
        pieces
    })
}
fn elongate_operations<I>(raw_operations: I) -> Vec<RawOperation<T>>
where
I: IntoIterator<Item = RawOperation<T>>,
{
let mut maybe_previous_insert: Option<RawOperation<T>> = None;
let mut maybe_previous_delete: Option<RawOperation<T>> = None;
let mut result: Vec<RawOperation<T>> = raw_operations
.into_iter()
.flat_map(|next| match next {
RawOperation::Insert(..) => match maybe_previous_insert.take() {
Some(prev) if prev.is_right_joinable() && next.is_left_joinable() => {
maybe_previous_insert = prev.extend(next);
Box::new(iter::empty()) as Box<dyn Iterator<Item = RawOperation<T>>>
}
prev => {
maybe_previous_insert = Some(next);
Box::new(prev.into_iter())
}
},
RawOperation::Delete(..) => match maybe_previous_delete.take() {
Some(prev) if prev.is_right_joinable() && next.is_left_joinable() => {
maybe_previous_delete = prev.extend(next);
Box::new(iter::empty()) as Box<dyn Iterator<Item = RawOperation<T>>>
}
prev => {
maybe_previous_delete = Some(next);
Box::new(prev.into_iter())
}
},
RawOperation::Equal(..) => Box::new(
maybe_previous_insert
.take()
.into_iter()
.chain(maybe_previous_delete.take())
.chain(iter::once(next)),
)
as Box<dyn Iterator<Item = RawOperation<T>>>,
})
.collect();
if let Some(prev) = maybe_previous_insert {
result.push(prev);
}
if let Some(prev) = maybe_previous_delete {
result.push(prev);
}
result
}
// Turn raw operations into ordered operations while keeping track of old & new
// indexes.
fn cook_operations<I>(raw_operations: I) -> impl Iterator<Item = OrderedOperation<T>>
@ -119,56 +198,6 @@ where
})
}
fn elongate_operations<I>(raw_operations: I) -> Vec<RawOperation<T>>
where
I: IntoIterator<Item = RawOperation<T>>,
{
let mut maybe_previous_insert: Option<RawOperation<T>> = None;
let mut maybe_previous_delete: Option<RawOperation<T>> = None;
let mut result: Vec<RawOperation<T>> = raw_operations
.into_iter()
.flat_map(|next| match next {
RawOperation::Insert(..) => {
if let Some(prev) = maybe_previous_insert.take() {
maybe_previous_insert = prev.extend(next);
} else {
maybe_previous_insert = Some(next);
}
Box::new(iter::empty()) as Box<dyn Iterator<Item = RawOperation<T>>>
}
RawOperation::Delete(..) => {
if let Some(prev) = maybe_previous_delete.take() {
maybe_previous_delete = prev.extend(next);
} else {
maybe_previous_delete = Some(next);
}
Box::new(iter::empty()) as Box<dyn Iterator<Item = RawOperation<T>>>
}
RawOperation::Equal(..) => Box::new(
maybe_previous_insert
.take()
.into_iter()
.chain(maybe_previous_delete.take())
.chain(iter::once(next)),
)
as Box<dyn Iterator<Item = RawOperation<T>>>,
})
.collect();
if let Some(prev) = maybe_previous_insert {
result.push(prev);
}
if let Some(prev) = maybe_previous_delete {
result.push(prev);
}
result
}
/// Create a new `EditedText` with the given operations.
/// The operations must be in the order in which they are meant to be
/// applied. The operations must not overlap.
@ -225,6 +254,7 @@ where
// Operations on the left and right must come in the same order so that
// inserts can be merged with other inserts and deletes with deletes.
usize::from(matches!(operation.operation, Operation::Delete { .. })),
operation.operation.start_index(),
// Make sure that the ordering is deterministic regardless which text
// is left or right.
match &operation.operation {

View file

@ -5,11 +5,21 @@ snapshot_kind: text
---
[
Token {
normalised: "what?",
original: " what?",
normalised: " what?",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "",
normalised: "what?",
original: "what?",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " ",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -4,20 +4,52 @@ expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: " hello,",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "hello,",
original: " hello,",
original: "hello,",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " \nwhere",
original: " \n",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "where",
original: " \nwhere",
original: "where",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " are",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "are",
original: " are",
original: "are",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " you?",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "you?",
original: " you?",
original: "you?",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -0,0 +1,39 @@
---
source: reconcile/src/tokenizer/word_tokenizer.rs
expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
snapshot_kind: text
---
[
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "hello,",
original: "hello,",
},
Token {
normalised: " \n",
original: " \n",
},
Token {
normalised: "where",
original: "where",
},
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "are",
original: "are",
},
Token {
normalised: " ",
original: " ",
},
Token {
normalised: "you?",
original: "you?",
},
]

View file

@ -7,9 +7,19 @@ snapshot_kind: text
Token {
normalised: "Hi",
original: "Hi",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: " there!",
original: " ",
is_left_joinable: true,
is_right_joinable: true,
},
Token {
normalised: "there!",
original: " there!",
original: "there!",
is_left_joinable: true,
is_right_joinable: true,
},
]

View file

@ -3,29 +3,45 @@ use serde::{Deserialize, Serialize};
/// A token is a string that has been normalised in some way.
/// The normalised form is used for comparison, while the original form is used
/// for applying Operations.
/// for applying `Operation`-s.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone)]
pub struct Token<T>
where
T: PartialEq + Clone + std::fmt::Debug,
{
normalised: T,
/// The normalised form of the token, used for deriving the diff.
pub normalised: T,
/// The original string, that should be inserted or deleted in the document.
original: String,
/// Whether the token is joinable with the previous token.
is_left_joinable: bool,
/// Whether the token is joinable with the next token.
is_right_joinable: bool,
}
impl From<&str> for Token<String> {
fn from(s: &str) -> Self { Token::new(s.trim().to_owned(), s.to_owned()) }
fn from(text: &str) -> Self { Token::new(text.to_owned(), text.to_owned(), true, true) }
}
impl<T> Token<T>
where
T: PartialEq + Clone + std::fmt::Debug,
{
pub fn new(normalised: T, original: String) -> Self {
pub fn new(
normalised: T,
original: String,
is_left_joinable: bool,
is_right_joinable: bool,
) -> Self {
Token {
normalised,
original,
is_left_joinable,
is_right_joinable,
}
}
@ -34,6 +50,10 @@ where
/// Returns a reference to the normalised form of the token.
pub fn normalised(&self) -> &T { &self.normalised }
/// Returns the length of the original text counted in `char`s (Unicode
/// scalar values), not in bytes.
pub fn get_original_length(&self) -> usize { self.original.chars().count() }
/// Returns whether the token is joinable with the previous token.
pub fn get_is_left_joinable(&self) -> bool { self.is_left_joinable }
/// Returns whether the token is joinable with the next token.
pub fn get_is_right_joinable(&self) -> bool { self.is_right_joinable }
}
impl<T> PartialEq for Token<T>

View file

@ -7,14 +7,59 @@ left: |
right: |
Hello there!
Best,
Andras
expected: |
Hello there!
How are you?
Best,
Andras
How are you?
---
parent: |
- my list
- 2nd item
- 3rd item
left: |
- my list
- 2nd item
- nested list
- very nested list
- 3rd item
right: |
- my list
- nested list
- 2nd item
- 3rd item
- another nested list
expected: |
- my list
- nested list
- 2nd item
- nested list
- very nested list
- 3rd item
- another nested list
---
parent: |
a
a
left: |
a
a
right: |
a
a
expected: |
a
a