Improve compact diff API (#24)

* Remove is_binary from API

* Format

* Rename file

* Test with more feature combinations

* Don't depend on serde for wasm

* Fix lint & tests

* Don't unwrap to MAX number

* Expose undiff to JS

* Add undiff tests

* Lint

* Change name
This commit is contained in:
Andras Schmelczer 2025-11-16 15:43:19 +00:00 committed by GitHub
parent 6191d1adb3
commit e85eb485e8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 430 additions and 424 deletions

20
Cargo.lock generated
View file

@ -124,12 +124,6 @@ version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]] [[package]]
name = "memory_units" name = "memory_units"
version = "0.4.0" version = "0.4.0"
@ -188,7 +182,6 @@ dependencies = [
"insta", "insta",
"pretty_assertions", "pretty_assertions",
"serde", "serde",
"serde_json",
"serde_yaml", "serde_yaml",
"test-case", "test-case",
"wasm-bindgen", "wasm-bindgen",
@ -247,19 +240,6 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]] [[package]]
name = "serde_yaml" name = "serde_yaml"
version = "0.9.34+deprecated" version = "0.9.34+deprecated"

View file

@ -24,7 +24,6 @@ path = "examples/merge-file.rs"
serde = { version = "1.0.219", optional = true, features = ["derive"] } serde = { version = "1.0.219", optional = true, features = ["derive"] }
wasm-bindgen = { version = "0.2.99", optional = true } wasm-bindgen = { version = "0.2.99", optional = true }
serde_json = { version = "1.0.145", optional = true }
# The `console_error_panic_hook` crate provides better debugging of panics by # The `console_error_panic_hook` crate provides better debugging of panics by
# logging them with `console.error`. This is great for development, but requires # logging them with `console.error`. This is great for development, but requires
@ -37,9 +36,9 @@ wee_alloc = { version = "0.4.2", optional = true }
[features] [features]
default = [] default = []
serde = [ "dep:serde" ] serde = [ "dep:serde" ]
wasm = [ "dep:wasm-bindgen", "dep:wee_alloc", "dep:serde_json", "serde" ] wasm = [ "dep:wasm-bindgen", "dep:wee_alloc" ]
console_error_panic_hook = [ "dep:console_error_panic_hook" ] console_error_panic_hook = [ "dep:console_error_panic_hook" ]
all = [ "wasm", "console_error_panic_hook" ] all = [ "wasm", "console_error_panic_hook", "serde" ]
[dev-dependencies] [dev-dependencies]
insta = "1.43.2" insta = "1.43.2"

View file

@ -23,7 +23,12 @@
<link rel="icon" type="image/x-icon" href="favicon.ico" /> <link rel="icon" type="image/x-icon" href="favicon.ico" />
<title>reconcile-text: conflict-free 3-way text merging</title> <title>reconcile-text: conflict-free 3-way text merging</title>
<link inline inline-asset="index.css" inline-asset-delete /> <link inline inline-asset="index.css" inline-asset-delete />
<script defer data-domain="reconcile" data-api="https://stats.schmelczer.dev/status" src="https://stats.schmelczer.dev/js/script.outbound-links.js"></script> <script
defer
data-domain="reconcile"
data-api="https://stats.schmelczer.dev/status"
src="https://stats.schmelczer.dev/js/script.outbound-links.js"
></script>
</head> </head>
<body> <body>
<div class="background"></div> <div class="background"></div>

View file

@ -1231,13 +1231,13 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/node": { "node_modules/@types/node": {
"version": "24.0.10", "version": "24.10.1",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.10.tgz", "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.1.tgz",
"integrity": "sha512-ENHwaH+JIRTDIEEbDK6QSQntAYGtbvdDXnMXnZaZ6k13Du1dPMmprkEHIL7ok2Wl2aZevetwTAb5S+7yIF+enA==", "integrity": "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ==",
"dev": true, "dev": true,
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"undici-types": "~7.8.0" "undici-types": "~7.16.0"
} }
}, },
"node_modules/@types/stack-utils": { "node_modules/@types/stack-utils": {
@ -5274,9 +5274,9 @@
} }
}, },
"node_modules/undici-types": { "node_modules/undici-types": {
"version": "7.8.0", "version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
"integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },

View file

@ -1,4 +1,9 @@
import { reconcile, reconcileWithHistory } from './index'; import { reconcile, reconcileWithHistory, diff, undiff } from './index';
import * as fs from 'fs';
import * as path from 'path';
import { fileURLToPath } from 'url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
describe('reconcile', () => { describe('reconcile', () => {
it('call reconcile without cursors', () => { it('call reconcile without cursors', () => {
@ -44,3 +49,35 @@ describe('reconcile', () => {
expect(result.history.length).toBeGreaterThan(0); expect(result.history.length).toBeGreaterThan(0);
}); });
}); });
// Round-trip property: applying `undiff` to the output of `diff` must
// reproduce the changed text for every ordered pair of fixture files.
describe('test_diff_and_undiff_are_inverse', () => {
  const resourcesPath = path.join(__dirname, '../../tests/resources');

  // Reads a fixture and returns the code points in [start, end), clamping
  // `end` to the length of the file.
  const readFileSlice = (fileName: string, start: number, end: number): string => {
    const content = fs.readFileSync(path.join(resourcesPath, fileName), 'utf-8');
    // Array.from iterates by code point, so surrogate pairs are never split.
    const codePoints = Array.from(content);
    const stop = Math.min(end, codePoints.length);
    return codePoints.slice(start, stop).join('');
  };

  const files = ['pride_and_prejudice.txt', 'room_with_a_view.txt', 'blns.txt'];
  const ranges = [{ start: 0, end: 50000 }];

  for (const file1 of files) {
    for (const file2 of files) {
      for (const range1 of ranges) {
        for (const range2 of ranges) {
          it(`should diff & undiff ${file1}[${range1.start}..${range1.end}], ${file2}[${range2.start}..${range2.end}] without panic`, () => {
            const content1 = readFileSlice(file1, range1.start, range1.end);
            const content2 = readFileSlice(file2, range2.start, range2.end);

            const changes = diff(content1, content2);
            const actual = undiff(content1, changes);

            expect(actual).toEqual(content2);
          });
        }
      }
    }
  }
});

View file

@ -4,8 +4,8 @@ import {
TextWithCursors as wasmTextWithCursors, TextWithCursors as wasmTextWithCursors,
SpanWithHistory as wasmSpanWithHistory, SpanWithHistory as wasmSpanWithHistory,
reconcileWithHistory as wasmReconcileWithHistory, reconcileWithHistory as wasmReconcileWithHistory,
isBinary as wasmIsBinary, diff as wasmDiff,
getCompactDiff as wasmGetCompactDiff, undiff as wasmUndiff,
initSync, initSync,
} from 'reconcile-text'; } from 'reconcile-text';
@ -183,22 +183,22 @@ export function reconcile(
/** /**
* Generates a compact diff representation between an original and changed text. * Generates a compact diff representation between an original and changed text.
* *
* These can be parsed and unpacked using Rust crate's EditedText::from_change_set. * These can be parsed and unpacked using the `undiff` function or the Rust crate's EditedText::from_diff.
* Cursor positions are omitted from the diff result.
* *
* This function computes the differences between two versions of text and returns * This function computes the differences between two versions of text and returns
* a compact string representation of those changes. The returned format is * a compact representation of those changes.
* serialised JSON.
* *
* @param original - The original/base version of the text * @param original - The original/base version of the text
* @param changed - The modified version of the text (either string or TextWithCursors with cursor positions) * @param changed - The modified version of the text (either string or TextWithCursors with cursor positions)
* @param tokenizer - The tokenisation strategy, which is the same as used in `reconcile`. * @param tokenizer - The tokenisation strategy, which is the same as used in `reconcile`.
* @returns A compact string representation of the diff between original and changed text * @returns An array representing the compact diff, with inserts as strings and deletes as negative integers.
*/ */
export function getCompactDiff( export function diff(
original: string, original: string,
changed: string | TextWithOptionalCursors, changed: string | TextWithOptionalCursors,
tokenizer: BuiltinTokenizer = 'Word' tokenizer: BuiltinTokenizer = 'Word'
): string { ): Array<number | string> {
init(); init();
if (!BUILTIN_TOKENIZERS.includes(tokenizer)) { if (!BUILTIN_TOKENIZERS.includes(tokenizer)) {
@ -207,13 +207,38 @@ export function getCompactDiff(
const changedWasm = toWasmTextWithCursors(changed); const changedWasm = toWasmTextWithCursors(changed);
const result = wasmGetCompactDiff(original, changedWasm, tokenizer); const result = wasmDiff(original, changedWasm, tokenizer);
changedWasm.free(); changedWasm.free();
return result; return result;
} }
/**
 * Applies a compact diff to an original text to reconstruct the changed version.
 *
 * This function takes an original text and a compact diff representation (as produced
 * by the `diff` function) and reconstructs the modified text. It is the inverse of
 * `diff`: `undiff(original, diff(original, changed))` is expected to equal `changed`.
 *
 * @param original - The original/base version of the text
 * @param diff - The compact diff array representing changes (inserts as strings, deletes as
 *   negative integers and, per the Rust `to_diff` documentation, unchanged spans as positive integers)
 * @param tokenizer - The tokenisation strategy, which is the same as used in `reconcile`.
 * @returns The reconstructed changed text as a string.
 * @throws Error if `tokenizer` is not one of the built-in tokenizers.
 */
export function undiff(
original: string,
diff: Array<number | string>,
tokenizer: BuiltinTokenizer = 'Word'
): string {
// Make sure the WASM module is ready before calling into it.
init();
// Reject unknown tokenizer names here rather than passing them to WASM.
if (!BUILTIN_TOKENIZERS.includes(tokenizer)) {
throw new Error(UNSUPPORTED_TOKENIZER_ERROR);
}
return wasmUndiff(original, diff, tokenizer);
}
/** /**
* Merges three versions of text and returns detailed provenance information. * Merges three versions of text and returns detailed provenance information.
* *
@ -272,19 +297,6 @@ export function reconcileWithHistory(
}; };
} }
/**
 * Check (using heuristics) if the given data is binary or text content.
 *
 * Only text inputs can be reconciled using the library's functions.
 *
 * @param data - The data to check for binary content. This should be a Uint8Array.
 * @returns True if the data is likely binary, false if it is likely text.
 */
export function isBinary(data: Uint8Array): boolean {
// Make sure the WASM module is ready before delegating to it.
init();
return wasmIsBinary(data);
}
function init() { function init() {
if (isInitialised) { if (isInitialised) {
return; return;

View file

@ -9,6 +9,5 @@
"declarationDir": "./dist/types", "declarationDir": "./dist/types",
"skipLibCheck": true, "skipLibCheck": true,
"inlineSourceMap": true "inlineSourceMap": true
}, }
"exclude": ["./dist", "**/*.test.ts"]
} }

View file

@ -4,7 +4,12 @@ set -e
wasm-pack build --target web --features wasm wasm-pack build --target web --features wasm
cargo test --verbose --features serde -- --include-ignored cargo test --verbose --features serde -- --include-ignored
cargo test --features serde,wasm
cargo test
cargo test --features serde
cargo test --features wasm
cargo test --features all
wasm-pack test --node --features wasm wasm-pack test --node --features wasm
cd reconcile-js cd reconcile-js

View file

@ -157,6 +157,8 @@
//! original text, making the size depend only on the changes made. //! original text, making the size depend only on the changes made.
//! //!
//! ```rust //! ```rust
//! # #[cfg(feature = "serde")]
//! # {
//! use reconcile_text::{EditedText, BuiltinTokenizer}; //! use reconcile_text::{EditedText, BuiltinTokenizer};
//! use serde_yaml; //! use serde_yaml;
//! use pretty_assertions::assert_eq; //! use pretty_assertions::assert_eq;
@ -170,20 +172,18 @@
//! &changes.into() //! &changes.into()
//! ); //! );
//! //!
//! let serialized = serde_yaml::to_string(&result.to_change_set()).unwrap(); //! let serialized = serde_yaml::to_string(&result.to_diff()).unwrap();
//! assert_eq!( //! assert_eq!(
//! serialized, //! serialized,
//! concat!( //! concat!(
//! "operations:\n",
//! "- 15\n", //! "- 15\n",
//! "- -6\n", //! "- -6\n",
//! "- ' easy with reconcile!'\n", //! "- ' easy with reconcile!'\n"
//! "cursors: []\n"
//! ) //! )
//! ); //! );
//! //!
//! let deserialized = serde_yaml::from_str(&serialized).unwrap(); //! let deserialized = serde_yaml::from_str(&serialized).unwrap();
//! let reconstructed = EditedText::from_change_set( //! let reconstructed = EditedText::from_diff(
//! original, //! original,
//! deserialized, //! deserialized,
//! &*BuiltinTokenizer::Word //! &*BuiltinTokenizer::Word
@ -192,13 +192,17 @@
//! reconstructed.apply().text(), //! reconstructed.apply().text(),
//! "Merging text is easy with reconcile!" //! "Merging text is easy with reconcile!"
//! ); //! );
//! # }
//! ``` //! ```
//! //!
//! ## Error handling //! ## Error handling
//! //!
//! The library is designed to be robust and will always produce a result, even //! The library is designed to be robust and will always produce a result, even
//! in edge cases. However, be aware that extremely large diffs may have //! for edge cases.
//! performance implications. //!
//! ## Performance
//!
//! Be aware that extremely large diffs may have performance implications.
//! //!
//! ## Algorithm overview //! ## Algorithm overview
//! //!
@ -211,13 +215,12 @@ mod tokenizer;
mod types; mod types;
mod utils; mod utils;
pub use operation_transformation::{ChangeSet, EditedText, reconcile}; pub use operation_transformation::{EditedText, reconcile};
pub use tokenizer::{BuiltinTokenizer, Tokenizer, token::Token}; pub use tokenizer::{BuiltinTokenizer, Tokenizer, token::Token};
pub use types::{ pub use types::{
cursor_position::CursorPosition, history::History, side::Side, cursor_position::CursorPosition, history::History, number_or_string::NumberOrString,
span_with_history::SpanWithHistory, text_with_cursors::TextWithCursors, side::Side, span_with_history::SpanWithHistory, text_with_cursors::TextWithCursors,
}; };
pub use utils::is_binary::is_binary;
#[cfg(feature = "wasm")] #[cfg(feature = "wasm")]
pub mod wasm; pub mod wasm;

View file

@ -1,12 +1,10 @@
mod edited_text; mod edited_text;
mod operation; mod operation;
mod transport;
mod utils; mod utils;
use std::fmt::Debug; use std::fmt::Debug;
pub use edited_text::EditedText; pub use edited_text::EditedText;
pub use operation::Operation; pub use operation::Operation;
pub use transport::ChangeSet;
use crate::{Tokenizer, types::text_with_cursors::TextWithCursors}; use crate::{Tokenizer, types::text_with_cursors::TextWithCursors};

View file

@ -4,15 +4,17 @@ use std::{fmt::Debug, vec};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::{ use crate::{
BuiltinTokenizer, ChangeSet, CursorPosition, TextWithCursors, BuiltinTokenizer, CursorPosition, TextWithCursors,
operation_transformation::{ operation_transformation::{
Operation, Operation,
transport::SimpleOperation,
utils::{cook_operations::cook_operations, elongate_operations::elongate_operations}, utils::{cook_operations::cook_operations, elongate_operations::elongate_operations},
}, },
raw_operation::RawOperation, raw_operation::RawOperation,
tokenizer::Tokenizer, tokenizer::Tokenizer,
types::{history::History, side::Side, span_with_history::SpanWithHistory}, types::{
history::History, number_or_string::NumberOrString, side::Side,
span_with_history::SpanWithHistory,
},
utils::string_builder::StringBuilder, utils::string_builder::StringBuilder,
}; };
@ -105,6 +107,11 @@ where
/// from the same original text. The operations are merged using the /// from the same original text. The operations are merged using the
/// principles of Operational Transformation. The cursors are updated /// principles of Operational Transformation. The cursors are updated
/// accordingly to reflect the changes made by the merged operations. /// accordingly to reflect the changes made by the merged operations.
///
/// # Panics
///
/// Panics if there's an integer overflow (in i64) when calculating new
/// cursor positions.
#[must_use] #[must_use]
#[allow(clippy::too_many_lines)] #[allow(clippy::too_many_lines)]
pub fn merge(self, other: Self) -> Self { pub fn merge(self, other: Self) -> Self {
@ -166,13 +173,14 @@ where
let result = operation.merge_operations(&mut last_other_op); let result = operation.merge_operations(&mut last_other_op);
if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result { if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
let merged_length_signed = let merged_length_signed = isize::try_from(merged_length)
isize::try_from(merged_length).unwrap_or(isize::MAX); .expect("merged_length must fit in isize");
let seen_left_length_signed = let seen_left_length_signed = isize::try_from(seen_left_length)
isize::try_from(seen_left_length).unwrap_or(isize::MAX); .expect("seen_left_length must fit in isize");
let op_len_signed = isize::try_from(op.len()).unwrap_or(isize::MAX); let op_len_signed =
let original_length_signed = isize::try_from(op.len()).expect("op.len() must fit in isize");
isize::try_from(original_length).unwrap_or(isize::MAX); let original_length_signed = isize::try_from(original_length)
.expect("original_length must fit in isize");
let shift = merged_length_signed - seen_left_length_signed + op_len_signed let shift = merged_length_signed - seen_left_length_signed + op_len_signed
- original_length_signed; - original_length_signed;
@ -199,13 +207,14 @@ where
let result = operation.merge_operations(&mut last_other_op); let result = operation.merge_operations(&mut last_other_op);
if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result { if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result {
let merged_length_signed = let merged_length_signed = isize::try_from(merged_length)
isize::try_from(merged_length).unwrap_or(isize::MAX); .expect("merged_length must fit in isize");
let seen_right_length_signed = let seen_right_length_signed = isize::try_from(seen_right_length)
isize::try_from(seen_right_length).unwrap_or(isize::MAX); .expect("seen_right_length must fit in isize");
let op_len_signed = isize::try_from(op.len()).unwrap_or(isize::MAX); let op_len_signed =
let original_length_signed = isize::try_from(op.len()).expect("op.len() must fit in isize");
isize::try_from(original_length).unwrap_or(isize::MAX); let original_length_signed = isize::try_from(original_length)
.expect("original_length must fit in isize");
let shift = merged_length_signed - seen_right_length_signed + op_len_signed let shift = merged_length_signed - seen_right_length_signed + op_len_signed
- original_length_signed; - original_length_signed;
@ -345,34 +354,122 @@ where
history history
} }
/// Serialize the `EditedText` as a `ChangeSet`, which contains only /// Convert the `EditedText` into a terse representation ready for
/// the operations and cursor positions, but without the original text. /// serialization. The result omits cursor positions and the original text.
/// This is useful for sending changes over the network if there's /// This is useful for sending text diffs over the network if there's a
/// a clear consensus on the original text. /// clear consensus on the original text.
///
/// Inserts are represented as strings, deletes as negative integers,
/// and equal spans as positive integers.
///
/// # Panics
///
/// Panics if there's an integer overflow in i64.
#[must_use] #[must_use]
pub fn to_change_set(&self) -> ChangeSet { pub fn to_diff(&self) -> Vec<NumberOrString> {
ChangeSet::new( let mut result: Vec<NumberOrString> = Vec::with_capacity(self.operations.len());
SimpleOperation::from_operations(&self.operations), let mut previous_equal: Option<usize> = None;
self.cursors.clone(),
) for operation in &self.operations {
match operation {
Operation::Equal { length, .. } => {
if let Some(prev_length) = previous_equal {
previous_equal = Some(prev_length + *length);
} else {
previous_equal = Some(*length);
}
} }
/// Deserialize an `EditedText` from a `ChangeSet` and the original text. Operation::Insert { text, .. } => {
/// This is useful for reconstructing the `EditedText` on the receiving if let Some(prev_length) = previous_equal {
/// end after sending only the `ChangeSet` over the network. result.push(NumberOrString::Number(
i64::try_from(prev_length).expect("prev_length must fit in i64"),
));
previous_equal = None;
}
let text: String = text
.iter()
.map(super::super::tokenizer::token::Token::original)
.collect();
result.push(NumberOrString::Text(text));
}
Operation::Delete {
deleted_character_count,
..
} => {
if let Some(prev_length) = previous_equal {
result.push(NumberOrString::Number(
i64::try_from(prev_length).expect("prev_length must fit in i64"),
));
previous_equal = None;
}
let count = i64::try_from(*deleted_character_count)
.expect("deleted_character_count must fit in i64");
result.push(NumberOrString::Number(-count));
}
}
}
if let Some(prev_length) = previous_equal {
result.push(NumberOrString::Number(
i64::try_from(prev_length).expect("prev_length must fit in i64"),
));
}
result
}
/// Deserialize an `EditedText` from a change list and the original text.
///
/// # Panics
///
/// Panics if there's an integer overflow in i64.
#[must_use] #[must_use]
pub fn from_change_set( pub fn from_diff(
text: &'a str, original_text: &'a str,
change_set: ChangeSet, diff: Vec<NumberOrString>,
tokenizer: &Tokenizer<T>, tokenizer: &Tokenizer<T>,
) -> EditedText<'a, T> { ) -> EditedText<'a, T> {
let operations = SimpleOperation::to_operations(change_set.operations, text, tokenizer); let mut operations: Vec<Operation<T>> = Vec::with_capacity(diff.len());
let mut order = 0;
for item in diff {
match item {
NumberOrString::Number(length) => {
if length >= 0 {
let length = usize::try_from(length).expect("length must fit in usize");
let original_characters: String =
original_text.chars().skip(order).take(length).collect();
let original_tokens = tokenizer(&original_characters);
for token in original_tokens {
operations
.push(Operation::create_equal(order, token.get_original_length()));
order += token.get_original_length();
}
} else {
let length =
usize::try_from(-length).expect("negative length must fit in usize");
operations.push(Operation::create_delete(order, length));
order += length;
}
}
NumberOrString::Text(text) => {
let tokens = tokenizer(&text);
operations.push(Operation::create_insert(order, tokens));
}
}
}
let operation_count = operations.len(); let operation_count = operations.len();
EditedText::new( EditedText::new(
text, original_text,
operations, operations,
vec![Side::Left; operation_count], vec![Side::Left; operation_count],
change_set.cursors, vec![],
) )
} }
} }
@ -423,34 +520,29 @@ mod tests {
assert_eq!(operations.apply().text(), expected); assert_eq!(operations.apply().text(), expected);
} }
#[cfg(feature = "serde")]
#[test] #[test]
fn test_change_set_deserialisation() { fn test_changes_deserialisation() {
let original = "Merging text is hard!"; let original = "Merging text is hard!";
let changes = "Merging text is easy with reconcile!"; let changes = "Merging text is easy with reconcile!";
let result = EditedText::from_strings(original, &changes.into()); let result = EditedText::from_strings(original, &changes.into());
let serialized = serde_yaml::to_string(&result.to_change_set()).unwrap(); let serialized = serde_yaml::to_string(&result.to_diff()).unwrap();
let expected = concat!(
"operations:\n",
"- 15\n",
"- -6\n",
"- ' easy with reconcile!'\n",
"cursors: []\n"
);
let expected = concat!("- 15\n", "- -6\n", "- ' easy with reconcile!'\n",);
assert_eq!(serialized, expected); assert_eq!(serialized, expected);
} }
#[cfg(feature = "serde")]
#[test] #[test]
fn test_change_set_serialization() { fn test_changes_serialization() {
let original = "The quick brown fox jumps over the lazy dog."; let original = "The quick brown fox jumps over the lazy dog.";
let updated = "The quick red fox jumped over the very lazy dog!"; let updated = "The quick red fox jumped over the very lazy dog!";
let edited_text = EditedText::from_strings(original, &updated.into()); let edited_text = EditedText::from_strings(original, &updated.into());
let change_set = edited_text.to_change_set(); let changes = edited_text.to_diff();
let deserialized_edited_text = let deserialized_edited_text =
EditedText::from_change_set(original, change_set, &*BuiltinTokenizer::Word); EditedText::from_diff(original, changes, &*BuiltinTokenizer::Word);
assert_eq!(deserialized_edited_text.apply().text(), updated); assert_eq!(deserialized_edited_text.apply().text(), updated);
} }

View file

@ -1,204 +0,0 @@
use std::fmt::Debug;
#[cfg(feature = "serde")]
use serde::{
Deserialize, Serialize,
de::{self, Deserializer, Visitor},
ser::Serializer,
};
use crate::{CursorPosition, Tokenizer, operation_transformation::Operation};
/// Compact form of an operation that stores only lengths and inserted text.
///
/// It is built from, and expanded back into, `Operation` values by the
/// associated `from_operations` / `to_operations` functions.
#[derive(Clone, PartialEq, Eq, Debug)]
pub enum SimpleOperation {
/// A span of `length` characters of the original text that is kept as-is.
Equal { length: usize },
/// Text inserted at the current position.
Insert { text: String },
/// A span of `length` characters of the original text that is removed.
Delete { length: usize },
}
impl SimpleOperation {
    /// Collapses token-level `Operation`s into their compact form.
    ///
    /// Consecutive `Equal` operations are merged into a single
    /// `SimpleOperation::Equal` whose length is the sum of the run. Inserts
    /// carry the concatenated original text of their tokens, and deletes carry
    /// only the number of deleted characters.
    ///
    /// Accepts any slice of operations; a `&Vec<Operation<T>>` argument
    /// coerces automatically, so existing callers keep working.
    pub fn from_operations<T>(operation: &[Operation<T>]) -> Vec<Self>
    where
        T: PartialEq + Clone + Debug,
    {
        let mut result: Vec<Self> = Vec::with_capacity(operation.len());
        // Total length of the run of `Equal` operations not yet written out.
        let mut pending_equal: Option<usize> = None;

        for op in operation {
            let simple = match op {
                Operation::Equal { length, .. } => {
                    // Keep accumulating; the run is emitted once it ends.
                    pending_equal = Some(pending_equal.unwrap_or(0) + *length);
                    continue;
                }
                Operation::Insert { text, .. } => {
                    let text: String = text
                        .iter()
                        .map(super::super::tokenizer::token::Token::original)
                        .collect();
                    SimpleOperation::Insert { text }
                }
                Operation::Delete {
                    deleted_character_count,
                    ..
                } => SimpleOperation::Delete {
                    length: *deleted_character_count,
                },
            };

            // A non-equal operation ends the current run of equal spans.
            if let Some(length) = pending_equal.take() {
                result.push(SimpleOperation::Equal { length });
            }
            result.push(simple);
        }

        // Emit a trailing run of equal spans, if any.
        if let Some(length) = pending_equal {
            result.push(SimpleOperation::Equal { length });
        }

        result
    }

    /// Expands compact operations back into token-level `Operation`s.
    ///
    /// `original_text` is the text that the `Equal` and `Delete` lengths index
    /// into (counted in characters). Each equal span is re-tokenised with
    /// `tokenizer` and yields one `Equal` operation per token.
    // This is similar to `crate::operation_transformation::utils::cook_operations`
    pub fn to_operations<T>(
        simple_operations: Vec<Self>,
        original_text: &str,
        tokenizer: &Tokenizer<T>,
    ) -> Vec<Operation<T>>
    where
        T: PartialEq + Clone + Debug,
    {
        let mut operations: Vec<Operation<T>> = Vec::with_capacity(simple_operations.len());
        // Current position in the original text, in characters.
        let mut order = 0;

        for simple_operation in simple_operations {
            match simple_operation {
                SimpleOperation::Equal { length } => {
                    // Re-scan from the start of the text to find the span.
                    let original_characters: String =
                        original_text.chars().skip(order).take(length).collect();
                    let original_tokens = tokenizer(&original_characters);

                    for token in original_tokens {
                        operations
                            .push(Operation::create_equal(order, token.get_original_length()));
                        order += token.get_original_length();
                    }
                }
                SimpleOperation::Insert { text } => {
                    // Inserts do not consume any of the original text.
                    let tokens = tokenizer(&text);
                    operations.push(Operation::create_insert(order, tokens));
                }
                SimpleOperation::Delete { length } => {
                    operations.push(Operation::create_delete(order, length));
                    order += length;
                }
            }
        }

        operations
    }
}
#[cfg(feature = "serde")]
impl Serialize for SimpleOperation {
    /// Writes the operation as a bare scalar: an unsigned integer for an equal
    /// span, a string for an insert and a negative integer for a delete.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // neat idea from https://github.com/spebern/operational-transform-rs/blob/9faa17f0a2b282ac2e09dbb2d29fdaf2ae0bbb4a/operational-transform/src/serde.rs#L14
        match self {
            Self::Insert { text } => serializer.serialize_str(text),
            Self::Equal { length } => serializer.serialize_u64(*length as u64),
            Self::Delete { length } => {
                // Lengths that do not fit in i64 are clamped to i64::MAX.
                let magnitude = i64::try_from(*length).unwrap_or(i64::MAX);
                serializer.serialize_i64(-magnitude)
            }
        }
    }
}
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for SimpleOperation {
    /// Reads an operation from a bare scalar: a non-negative integer is an
    /// equal span, a negative integer is a delete and a string is an insert.
    fn deserialize<D>(deserializer: D) -> Result<SimpleOperation, D::Error>
    where
        D: Deserializer<'de>,
    {
        use std::fmt;

        struct OperationVisitor;

        impl Visitor<'_> for OperationVisitor {
            type Value = SimpleOperation;

            fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
                formatter.write_str("an integer between -2^63 and 2^64-1 or a string")
            }

            fn visit_u64<E>(self, value: u64) -> Result<Self::Value, E>
            where
                E: de::Error,
            {
                Ok(SimpleOperation::Equal {
                    length: usize::try_from(value).unwrap_or(usize::MAX),
                })
            }

            fn visit_i64<E>(self, value: i64) -> Result<Self::Value, E>
            where
                E: de::Error,
            {
                // Some formats report every integer through `visit_i64`, so the
                // sign, not the visitor method, decides the operation kind.
                match u64::try_from(value) {
                    Ok(non_negative) => self.visit_u64(non_negative),
                    Err(_) => Ok(SimpleOperation::Delete {
                        // `unsigned_abs` cannot overflow, unlike `-value`
                        // for `i64::MIN`.
                        length: usize::try_from(value.unsigned_abs()).unwrap_or(usize::MAX),
                    }),
                }
            }

            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
            where
                E: de::Error,
            {
                Ok(SimpleOperation::Insert {
                    text: value.to_owned(),
                })
            }
        }

        deserializer.deserialize_any(OperationVisitor)
    }
}
/// A serializable representation of the changes made to a text document
/// without the original text.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Default)]
pub struct ChangeSet {
/// Compact operations describing the edit.
pub operations: Vec<SimpleOperation>,
/// Cursor positions that accompany the operations.
pub cursors: Vec<CursorPosition>,
}
impl ChangeSet {
#[must_use]
pub fn new(operations: Vec<SimpleOperation>, cursors: Vec<CursorPosition>) -> Self {
Self {
operations,
cursors,
}
}
}

View file

@ -1,5 +1,6 @@
pub mod cursor_position; pub mod cursor_position;
pub mod history; pub mod history;
pub mod number_or_string;
pub mod side; pub mod side;
pub mod span_with_history; pub mod span_with_history;
pub mod text_with_cursors; pub mod text_with_cursors;

View file

@ -0,0 +1,74 @@
use std::fmt::Debug;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
#[cfg(feature = "wasm")]
use wasm_bindgen::prelude::*;
/// One entry of a compact diff: either an integer or a string.
///
/// As documented on `EditedText::to_diff`, inserts are represented as strings,
/// deletes as negative integers and equal spans as positive integers.
/// With the `serde` feature the enum is untagged, so each entry
/// (de)serialises as a bare number or string.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(untagged))]
#[derive(Debug, Clone, PartialEq)]
pub enum NumberOrString {
/// A signed length; the sign selects between an equal span and a delete.
Number(i64),
/// Inserted text.
Text(String),
}
#[cfg(feature = "wasm")]
impl TryFrom<JsValue> for NumberOrString {
    type Error = DeserialisationError;

    /// Converts a JS value by trying an integer first and a string second.
    ///
    /// # Errors
    ///
    /// Returns a `DeserialisationError` when the value converts to neither.
    fn try_from(value: JsValue) -> Result<Self, Self::Error> {
        // The integer attempt uses a clone so the value is still available
        // for the string attempt.
        i64::try_from(value.clone())
            .map(NumberOrString::Number)
            .or_else(|_| String::try_from(value).map(NumberOrString::Text))
            .map_err(|_| {
                DeserialisationError::new("Could not parse JsValue as either number or string")
            })
    }
}
#[cfg(feature = "wasm")]
impl From<NumberOrString> for JsValue {
    /// Unwraps the entry and converts the inner number or string into a JS value.
    fn from(value: NumberOrString) -> Self {
        match value {
            NumberOrString::Text(text) => Self::from(text),
            NumberOrString::Number(num) => Self::from(num),
        }
    }
}
/// Error type for deserialisation failures, e.g. when a `JsValue` cannot be
/// converted into a `NumberOrString`.
#[cfg(feature = "wasm")]
#[derive(Debug, Clone)]
pub struct DeserialisationError {
/// Human-readable description of what could not be converted.
pub message: String,
}
#[cfg(feature = "wasm")]
impl DeserialisationError {
    /// Creates an error from anything that converts into a `String`.
    pub fn new(message: impl Into<String>) -> Self {
        let message: String = message.into();
        Self { message }
    }
}
#[cfg(feature = "wasm")]
impl std::fmt::Display for DeserialisationError {
    /// Writes the message prefixed with `Deserialisation error: `.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("Deserialisation error: ")?;
        f.write_str(&self.message)
    }
}

#[cfg(feature = "wasm")]
impl std::error::Error for DeserialisationError {}
#[cfg(feature = "wasm")]
impl From<DeserialisationError> for JsValue {
    /// Converts the error into a JS string holding only its message.
    fn from(error: DeserialisationError) -> Self {
        let DeserialisationError { message } = error;
        Self::from_str(&message)
    }
}

View file

@ -1,6 +1,5 @@
pub mod common_prefix_len; pub mod common_prefix_len;
pub mod common_suffix_len; pub mod common_suffix_len;
pub mod find_longest_prefix_contained_within; pub mod find_longest_prefix_contained_within;
pub mod is_binary;
pub mod myers_diff; pub mod myers_diff;
pub mod string_builder; pub mod string_builder;

View file

@ -1,26 +0,0 @@
/// Heuristically decide whether `data` is the content of a binary file rather
/// than a text file.
///
/// Data is reported as binary when it contains a NUL byte or when it is not
/// valid UTF-8. Only text inputs can be reconciled using the crate's
/// functions.
#[must_use]
pub fn is_binary(data: &[u8]) -> bool {
    // NUL is a legal UTF-8 code point, yet it practically never shows up in
    // human-readable text, so its presence alone marks the data as binary.
    let has_nul = data.iter().any(|&byte| byte == 0);

    has_nul || std::str::from_utf8(data).is_err()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_binary() {
        assert!(is_binary(&[0, 159, 146, 150]));
        assert!(is_binary(&[0, 12]));
        assert!(!is_binary(b"hello"));
    }

    /// The fixtures above all contain a NUL byte, so they never reach the
    /// UTF-8 validation; this case exercises that branch on its own.
    #[test]
    fn test_is_binary_invalid_utf8_without_nul() {
        assert!(is_binary(&[0xff, 0xfe]));
        assert!(is_binary(&[159, 146, 150]));
    }

    /// A NUL byte marks the data as binary even when the bytes are valid UTF-8.
    #[test]
    fn test_is_binary_nul_inside_valid_text() {
        assert!(is_binary(b"hel\0lo"));
    }

    /// Empty input and multi-byte UTF-8 text are both treated as text.
    #[test]
    fn test_is_binary_empty_and_multibyte_text() {
        assert!(!is_binary(b""));
        assert!(!is_binary("héllo wörld".as_bytes()));
    }
}

View file

@ -87,7 +87,7 @@ struct V {
impl V { impl V {
fn new(max_d: usize) -> Self { fn new(max_d: usize) -> Self {
// max_d should fit in isize for the algorithm to work correctly // max_d should fit in isize for the algorithm to work correctly
let offset = isize::try_from(max_d).unwrap_or(isize::MAX); let offset = isize::try_from(max_d).expect("max_d must fit in isize");
Self { Self {
offset, offset,
v: vec![0; 2 * max_d], v: vec![0; 2 * max_d],
@ -101,16 +101,15 @@ impl Index<isize> for V {
type Output = usize; type Output = usize;
fn index(&self, index: isize) -> &Self::Output { fn index(&self, index: isize) -> &Self::Output {
let idx = usize::try_from(index + self.offset).unwrap_or(usize::MAX); let idx = usize::try_from(index + self.offset).expect("index + offset must fit in usize");
&self.v[idx.min(self.v.len().saturating_sub(1))] &self.v[idx]
} }
} }
impl IndexMut<isize> for V { impl IndexMut<isize> for V {
fn index_mut(&mut self, index: isize) -> &mut Self::Output { fn index_mut(&mut self, index: isize) -> &mut Self::Output {
let idx = usize::try_from(index + self.offset).unwrap_or(usize::MAX); let idx = usize::try_from(index + self.offset).expect("index + offset must fit in usize");
let len = self.v.len(); &mut self.v[idx]
&mut self.v[idx.min(len.saturating_sub(1))]
} }
} }
@ -145,7 +144,8 @@ where
// By Lemma 1 in the paper, the optimal edit script length is odd or even as // By Lemma 1 in the paper, the optimal edit script length is odd or even as
// `delta` is odd or even. // `delta` is odd or even.
let delta = isize::try_from(n).unwrap_or(isize::MAX) - isize::try_from(m).unwrap_or(isize::MAX); let delta = isize::try_from(n).expect("n must fit in isize")
- isize::try_from(m).expect("m must fit in isize");
let odd = delta & 1 == 1; let odd = delta & 1 == 1;
// The initial point at (0, -1) // The initial point at (0, -1)
@ -157,7 +157,7 @@ where
assert!(vf.len() >= d_max); assert!(vf.len() >= d_max);
assert!(vb.len() >= d_max); assert!(vb.len() >= d_max);
let d_max_isize = isize::try_from(d_max).unwrap_or(isize::MAX); let d_max_isize = isize::try_from(d_max).expect("d_max must fit in isize");
for d in 0..d_max_isize { for d in 0..d_max_isize {
// Forward path // Forward path
for k in (-d..=d).rev().step_by(2) { for k in (-d..=d).rev().step_by(2) {
@ -166,7 +166,8 @@ where
} else { } else {
vf[k - 1] + 1 vf[k - 1] + 1
}; };
let y = usize::try_from(isize::try_from(x).unwrap_or(isize::MAX) - k).unwrap_or(0); let y = usize::try_from(isize::try_from(x).expect("x must fit in isize") - k)
.expect("x - k must be non-negative and fit in usize");
// The coordinate of the start of a snake // The coordinate of the start of a snake
let (x0, y0) = (x, y); let (x0, y0) = (x, y);
@ -204,7 +205,8 @@ where
} else { } else {
vb[k - 1] + 1 vb[k - 1] + 1
}; };
let mut y = usize::try_from(isize::try_from(x).unwrap_or(isize::MAX) - k).unwrap_or(0); let mut y = usize::try_from(isize::try_from(x).expect("x must fit in isize") - k)
.expect("x - k must be non-negative and fit in usize");
// The coordinate of the start of a snake // The coordinate of the start of a snake
if x < n && y < m { if x < n && y < m {

View file

@ -3,7 +3,7 @@ use core::str;
use wasm_bindgen::prelude::*; use wasm_bindgen::prelude::*;
use crate::{BuiltinTokenizer, CursorPosition, SpanWithHistory, TextWithCursors}; use crate::{BuiltinTokenizer, CursorPosition, EditedText, SpanWithHistory, TextWithCursors};
#[global_allocator] #[global_allocator]
static ALLOC: wee_alloc::WeeAlloc<'_> = wee_alloc::WeeAlloc::INIT; static ALLOC: wee_alloc::WeeAlloc<'_> = wee_alloc::WeeAlloc::INIT;
@ -32,6 +32,7 @@ pub fn reconcile_with_history(
tokenizer: BuiltinTokenizer, tokenizer: BuiltinTokenizer,
) -> TextWithCursorsAndHistory { ) -> TextWithCursorsAndHistory {
set_panic_hook(); set_panic_hook();
let reconciled = crate::reconcile(parent, left, right, &*tokenizer); let reconciled = crate::reconcile(parent, left, right, &*tokenizer);
let text_with_cursors = reconciled.apply(); let text_with_cursors = reconciled.apply();
@ -54,10 +55,6 @@ pub fn reconcile_with_history(
/// # Returns /// # Returns
/// ///
/// The merged document. /// The merged document.
///
/// # Panics
///
/// If any of the input documents are not valid UTF-8 strings.
#[wasm_bindgen(js_name = genericReconcile)] #[wasm_bindgen(js_name = genericReconcile)]
#[must_use] #[must_use]
pub fn generic_reconcile( pub fn generic_reconcile(
@ -68,51 +65,56 @@ pub fn generic_reconcile(
) -> Vec<u8> { ) -> Vec<u8> {
set_panic_hook(); set_panic_hook();
if crate::is_binary(parent) || crate::is_binary(left) || crate::is_binary(right) { if let (Some(parent), Some(left), Some(right)) = (
right.to_vec() string_or_nothing(parent),
string_or_nothing(left),
string_or_nothing(right),
) {
crate::reconcile(&parent, &left.into(), &right.into(), &*tokenizer)
.apply()
.text()
.into_bytes()
} else { } else {
crate::reconcile( right.to_vec()
str::from_utf8(parent).expect("parent must be valid UTF-8 because it's not binary"), }
&str::from_utf8(left) }
.expect("left must be valid UTF-8 because it's not binary")
.into(), /// WASM wrapper around getting a compact diff representation of two texts as a
&str::from_utf8(right) /// list of numbers and strings.
.expect("right must be valid UTF-8 because it's not binary") #[wasm_bindgen(js_name = diff)]
.into(), #[must_use]
pub fn diff(parent: &str, changed: &TextWithCursors, tokenizer: BuiltinTokenizer) -> Vec<JsValue> {
set_panic_hook();
let edited_text = EditedText::from_strings_with_tokenizer(parent, changed, &*tokenizer);
edited_text
.to_diff()
.into_iter()
.map(std::convert::Into::into)
.collect()
}
/// Inverse of `diff`, applies a compact diff representation to a parent text
///
/// # Panics
///
/// Panics if the diff format is invalid or there's an integer overflow when
/// applying the diff.
#[wasm_bindgen(js_name = undiff)]
#[must_use]
pub fn undiff(parent: &str, diff: Vec<JsValue>, tokenizer: BuiltinTokenizer) -> String {
set_panic_hook();
EditedText::from_diff(
parent,
diff.into_iter()
.map(std::convert::TryInto::try_into)
.collect::<Result<_, _>>()
.expect("Invalid diff format"),
&*tokenizer, &*tokenizer,
) )
.apply() .apply()
.text() .text()
.into_bytes()
}
}
/// WASM wrapper that returns the compact diff between `parent` and `changed`
/// as a JSON string.
///
/// The change set is computed with the given `tokenizer` and then serialised
/// with `serde_json`.
///
/// # Panics
///
/// If serialization to JSON fails which should not happen
#[wasm_bindgen(js_name = getCompactDiff)]
#[must_use]
pub fn get_compact_diff(
    parent: &str,
    changed: &TextWithCursors,
    tokenizer: BuiltinTokenizer,
) -> String {
    set_panic_hook();

    let edited = crate::EditedText::from_strings_with_tokenizer(parent, changed, &*tokenizer);
    // Serialise in its own statement so the change set temporary is released
    // before the function returns.
    let json = serde_json::to_string(&edited.to_change_set());
    json.expect("Failed to serialize change set")
}
/// Heuristically determine if the given data is a binary or a text file's
/// content.
///
/// Exposed to JavaScript under the name `isBinary`. Returns whatever
/// `crate::is_binary` returns for `data`.
#[wasm_bindgen(js_name = isBinary)]
#[must_use]
pub fn is_binary(data: &[u8]) -> bool {
// Called first, like in the other exported wrappers in this file.
set_panic_hook();
// Thin wrapper: the heuristic itself lives in the crate-level function.
crate::is_binary(data)
}
fn set_panic_hook() { fn set_panic_hook() {
@ -140,3 +142,30 @@ impl TextWithCursorsAndHistory {
#[must_use] #[must_use]
pub fn history(&self) -> Vec<SpanWithHistory> { self.history.clone() } pub fn history(&self) -> Vec<SpanWithHistory> { self.history.clone() }
} }
/// Decodes `data` as UTF-8 text, or yields `None` when it is likely binary.
///
/// Data counts as binary when it contains a NUL byte or is not valid UTF-8.
#[must_use]
fn string_or_nothing(data: &[u8]) -> Option<String> {
    // NUL is a legal UTF-8 code point, yet it practically never shows up in
    // human-readable text, so its presence alone rejects the data.
    let has_nul = data.iter().any(|&byte| byte == 0);
    if has_nul {
        return None;
    }

    match std::str::from_utf8(data) {
        Ok(text) => Some(text.to_owned()),
        Err(_) => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_string_or_nothing() {
        assert_eq!(string_or_nothing(&[0, 159, 146, 150]), None);
        assert_eq!(string_or_nothing(&[0, 12]), None);
        assert_eq!(string_or_nothing(b"hello"), Some("hello".into()));
    }

    /// The `None` fixtures above all contain a NUL byte, so they never reach
    /// the UTF-8 validation; this case exercises that branch on its own.
    #[test]
    fn test_string_or_nothing_invalid_utf8_without_nul() {
        assert_eq!(string_or_nothing(&[0xff, 0xfe]), None);
        assert_eq!(string_or_nothing(&[159, 146, 150]), None);
    }

    /// A NUL byte rejects the data even when the bytes are valid UTF-8.
    #[test]
    fn test_string_or_nothing_nul_inside_valid_text() {
        assert_eq!(string_or_nothing(b"hel\0lo"), None);
    }

    /// Empty input and multi-byte UTF-8 text are both returned as text.
    #[test]
    fn test_string_or_nothing_empty_and_multibyte_text() {
        assert_eq!(string_or_nothing(b""), Some(String::new()));
        assert_eq!(
            string_or_nothing("héllo wörld".as_bytes()),
            Some("héllo wörld".into())
        );
    }
}

View file

@ -3,7 +3,7 @@ mod example_document;
use std::{fs, path::Path}; use std::{fs, path::Path};
use example_document::ExampleDocument; use example_document::ExampleDocument;
use reconcile_text::{BuiltinTokenizer, EditedText, reconcile}; use reconcile_text::{BuiltinTokenizer, reconcile};
use serde::Deserialize; use serde::Deserialize;
#[test] #[test]
@ -34,8 +34,11 @@ fn test_document_one_way_with_cursors() {
} }
} }
#[cfg(feature = "serde")]
#[test] #[test]
fn test_document_one_way_with_cursors_and_serialisation() { fn test_document_one_way_with_serialisation() {
use reconcile_text::EditedText;
for doc in &get_all_documents() { for doc in &get_all_documents() {
let parent = doc.parent(); let parent = doc.parent();
let left_operations = let left_operations =
@ -47,19 +50,23 @@ fn test_document_one_way_with_cursors_and_serialisation() {
); );
let serialised_left = let serialised_left =
serde_yaml::from_str(&serde_yaml::to_string(&left_operations.to_change_set()).unwrap()) serde_yaml::from_str(&serde_yaml::to_string(&left_operations.to_diff()).unwrap())
.unwrap(); .unwrap();
let serialised_right = serde_yaml::from_str( let serialised_right =
&serde_yaml::to_string(&right_operations.to_change_set()).unwrap(), serde_yaml::from_str(&serde_yaml::to_string(&right_operations.to_diff()).unwrap())
)
.unwrap(); .unwrap();
let restored_left_operations = let restored_left_operations =
EditedText::from_change_set(&parent, serialised_left, &*BuiltinTokenizer::Word); EditedText::from_diff(&parent, serialised_left, &*BuiltinTokenizer::Word);
let restored_right_operations = let restored_right_operations =
EditedText::from_change_set(&parent, serialised_right, &*BuiltinTokenizer::Word); EditedText::from_diff(&parent, serialised_right, &*BuiltinTokenizer::Word);
doc.assert_eq(&restored_left_operations.merge(restored_right_operations)); doc.assert_eq_without_cursors(
&restored_left_operations
.merge(restored_right_operations)
.apply()
.text(),
);
} }
} }

View file

@ -55,22 +55,16 @@ fn test_merge_binary() {
); );
} }
#[wasm_bindgen_test(unsupported = test)] #[wasm_bindgen_test] // JsValue isn't supported outside of wasm
fn test_is_binary() { fn test_diff() {
assert!(is_binary(&[0, 159, 146, 150]));
assert!(is_binary(&[0, 12]));
assert!(!is_binary(b"hello"));
}
#[wasm_bindgen_test(unsupported = test)]
fn test_get_compact_diff() {
let parent = "hello "; let parent = "hello ";
let changed = "world"; let changed = "world";
let result = get_compact_diff(parent, &changed.into(), BuiltinTokenizer::Word);
assert_eq!(result, "{\"operations\":[-6,\"world\"],\"cursors\":[]}");
}
#[wasm_bindgen_test(unsupported = test)] let result = diff(parent, &changed.into(), BuiltinTokenizer::Word);
fn test_is_binary_empty() {
assert!(!is_binary(b"")); assert_eq!(result.len(), 2);
let first: i64 = result[0].clone().try_into().unwrap();
let second: String = result[1].clone().try_into().unwrap();
assert_eq!(first, -6);
assert_eq!(second, "world");
} }