diff --git a/Cargo.lock b/Cargo.lock index b739f33..5e187e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -124,12 +124,6 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" -[[package]] -name = "memchr" -version = "2.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" - [[package]] name = "memory_units" version = "0.4.0" @@ -188,7 +182,6 @@ dependencies = [ "insta", "pretty_assertions", "serde", - "serde_json", "serde_yaml", "test-case", "wasm-bindgen", @@ -247,19 +240,6 @@ dependencies = [ "syn", ] -[[package]] -name = "serde_json" -version = "1.0.145" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", - "serde_core", -] - [[package]] name = "serde_yaml" version = "0.9.34+deprecated" diff --git a/Cargo.toml b/Cargo.toml index f960633..74820a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ path = "examples/merge-file.rs" serde = { version = "1.0.219", optional = true, features = ["derive"] } wasm-bindgen = { version = "0.2.99", optional = true } -serde_json = { version = "1.0.145", optional = true } # The `console_error_panic_hook` crate provides better debugging of panics by # logging them with `console.error`. This is great for development, but requires @@ -37,9 +36,9 @@ wee_alloc = { version = "0.4.2", optional = true } [features] default = [] serde = [ "dep:serde" ] -wasm = [ "dep:wasm-bindgen", "dep:wee_alloc", "dep:serde_json", "serde" ] +wasm = [ "dep:wasm-bindgen", "dep:wee_alloc" ] console_error_panic_hook = [ "dep:console_error_panic_hook" ] -all = [ "wasm", "console_error_panic_hook" ] +all = [ "wasm", "console_error_panic_hook", "serde" ] [dev-dependencies] insta = "1.43.2" diff --git a/examples/website/src/index.html b/examples/website/src/index.html index dee8c53..71d5cbc 100644 --- a/examples/website/src/index.html +++ b/examples/website/src/index.html @@ -23,7 +23,12 @@ reconcile-text: conflict-free 3-way text merging - +
diff --git a/reconcile-js/package-lock.json b/reconcile-js/package-lock.json index 573aba2..38dcb9c 100644 --- a/reconcile-js/package-lock.json +++ b/reconcile-js/package-lock.json @@ -1231,13 +1231,13 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "24.0.10", - "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.10.tgz", - "integrity": "sha512-ENHwaH+JIRTDIEEbDK6QSQntAYGtbvdDXnMXnZaZ6k13Du1dPMmprkEHIL7ok2Wl2aZevetwTAb5S+7yIF+enA==", + "version": "24.10.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.1.tgz", + "integrity": "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ==", "dev": true, "license": "MIT", "dependencies": { - "undici-types": "~7.8.0" + "undici-types": "~7.16.0" } }, "node_modules/@types/stack-utils": { @@ -5274,9 +5274,9 @@ } }, "node_modules/undici-types": { - "version": "7.8.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", - "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", + "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", "dev": true, "license": "MIT" }, diff --git a/reconcile-js/src/index.test.ts b/reconcile-js/src/index.test.ts index fa1e41d..1a4394f 100644 --- a/reconcile-js/src/index.test.ts +++ b/reconcile-js/src/index.test.ts @@ -1,4 +1,9 @@ -import { reconcile, reconcileWithHistory } from './index'; +import { reconcile, reconcileWithHistory, diff, undiff } from './index'; +import * as fs from 'fs'; +import * as path from 'path'; +import { fileURLToPath } from 'url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); describe('reconcile', () => { it('call reconcile without cursors', () => { @@ -44,3 +49,35 @@ describe('reconcile', () => { expect(result.history.length).toBeGreaterThan(0); }); }); + +describe('test_diff_and_undiff_are_inverse', () => { + const resourcesPath = path.join(__dirname, '../../tests/resources'); + + const readFileSlice = (fileName: string, start: number, end: number): string => { + const filePath = path.join(resourcesPath, fileName); + const content = fs.readFileSync(filePath, 'utf-8'); + const chars = Array.from(content); // Handle unicode properly + return chars.slice(start, Math.min(end, chars.length)).join(''); + }; + + const files = ['pride_and_prejudice.txt', 'room_with_a_view.txt', 'blns.txt']; + + const ranges = [{ start: 0, end: 50000 }]; + + files.forEach((file1) => { + files.forEach((file2) => { + ranges.forEach((range1) => { + ranges.forEach((range2) => { + it(`should diff & undiff ${file1}[${range1.start}..${range1.end}], ${file2}[${range2.start}..${range2.end}] without panic`, () => { + const content1 = readFileSlice(file1, range1.start, range1.end); + const content2 = readFileSlice(file2, range2.start, range2.end); + + const changes = diff(content1, content2); + const actual = undiff(content1, changes); + expect(actual).toEqual(content2); + }); + }); + }); + }); + }); +}); diff --git a/reconcile-js/src/index.ts b/reconcile-js/src/index.ts index 247db26..be7ea8f 100644 --- a/reconcile-js/src/index.ts +++ b/reconcile-js/src/index.ts @@ -4,8 +4,8 @@ import { TextWithCursors as wasmTextWithCursors, SpanWithHistory as wasmSpanWithHistory, reconcileWithHistory as wasmReconcileWithHistory, - isBinary as wasmIsBinary, - getCompactDiff as wasmGetCompactDiff, + diff as wasmDiff, + undiff as wasmUndiff, initSync, } from 'reconcile-text'; @@ -183,22 +183,22 @@ export function reconcile( /** * Generates a compact diff representation between an original and changed text. * - * These can be parsed and unpacked using Rust crate's EditedText::from_change_set. + * These can be parsed and unpacked using the `undiff` function or the Rust crate's EditedText::from_diff. + * Cursor positions are omitted from the diff result. * * This function computes the differences between two versions of text and returns - * a compact string representation of those changes. The returned format is - * serialised JSON. + * a compact representation of those changes. * * @param original - The original/base version of the text * @param changed - The modified version of the text (either string or TextWithCursors with cursor positions) * @param tokenizer - The tokenisation strategy, which is the same as used in `reconcile`. - * @returns A compact string representation of the diff between original and changed text + * @returns An array representing the compact diff, with inserts as strings and deletes as negative integers. */ -export function getCompactDiff( +export function diff( original: string, changed: string | TextWithOptionalCursors, tokenizer: BuiltinTokenizer = 'Word' -): string { +): Array { init(); if (!BUILTIN_TOKENIZERS.includes(tokenizer)) { @@ -207,13 +207,38 @@ export function getCompactDiff( const changedWasm = toWasmTextWithCursors(changed); - const result = wasmGetCompactDiff(original, changedWasm, tokenizer); + const result = wasmDiff(original, changedWasm, tokenizer); changedWasm.free(); return result; } +/** + * Applies a compact diff to an original text to reconstruct the changed version. + * + * This function takes an original text and a compact diff representation (as produced + * by the `diff` function) and reconstructs the modified text. + * + * @param original - The original/base version of the text + * @param diff - The compact diff array representing changes (inserts as strings, deletes as negative integers) + * @param tokenizer - The tokenisation strategy, which is the same as used in `reconcile`. + * @returns The reconstructed changed text as a string. + */ +export function undiff( + original: string, + diff: Array, + tokenizer: BuiltinTokenizer = 'Word' +): string { + init(); + + if (!BUILTIN_TOKENIZERS.includes(tokenizer)) { + throw new Error(UNSUPPORTED_TOKENIZER_ERROR); + } + + return wasmUndiff(original, diff, tokenizer); +} + /** * Merges three versions of text and returns detailed provenance information. * @@ -272,19 +297,6 @@ export function reconcileWithHistory( }; } -/** - * Check (using heuristics) if the given data is binary or text content. - * - * Only text inputs can be reconciled using the library's functions. - * - * @param data - The data to check for binary content. This should be a Uint8Array. - * @returns True if the data is likely binary, false if it is likely text. - */ -export function isBinary(data: Uint8Array): boolean { - init(); - return wasmIsBinary(data); -} - function init() { if (isInitialised) { return; diff --git a/reconcile-js/tsconfig.json b/reconcile-js/tsconfig.json index c462052..08dee60 100644 --- a/reconcile-js/tsconfig.json +++ b/reconcile-js/tsconfig.json @@ -9,6 +9,5 @@ "declarationDir": "./dist/types", "skipLibCheck": true, "inlineSourceMap": true - }, - "exclude": ["./dist", "**/*.test.ts"] + } } diff --git a/scripts/test.sh b/scripts/test.sh index 7eb816c..d5b1ab4 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -4,7 +4,12 @@ set -e wasm-pack build --target web --features wasm cargo test --verbose --features serde -- --include-ignored -cargo test --features serde,wasm + +cargo test +cargo test --features serde +cargo test --features wasm +cargo test --features all + wasm-pack test --node --features wasm cd reconcile-js diff --git a/src/lib.rs b/src/lib.rs index 1dd78ff..2119bea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -157,6 +157,8 @@ //! original text, making the size only depends on the changes made. //! //! ```rust +//! # #[cfg(feature = "serde")] +//! # { //! use reconcile_text::{EditedText, BuiltinTokenizer}; //! use serde_yaml; //! use pretty_assertions::assert_eq; @@ -170,20 +172,18 @@ //! &changes.into() //! ); //! -//! let serialized = serde_yaml::to_string(&result.to_change_set()).unwrap(); +//! let serialized = serde_yaml::to_string(&result.to_diff()).unwrap(); //! assert_eq!( //! serialized, //! concat!( -//! "operations:\n", //! "- 15\n", //! "- -6\n", -//! "- ' easy with reconcile!'\n", -//! "cursors: []\n" +//! "- ' easy with reconcile!'\n" //! ) //! ); //! //! let deserialized = serde_yaml::from_str(&serialized).unwrap(); -//! let reconstructed = EditedText::from_change_set( +//! let reconstructed = EditedText::from_diff( //! original, //! deserialized, //! &*BuiltinTokenizer::Word @@ -192,13 +192,17 @@ //! reconstructed.apply().text(), //! "Merging text is easy with reconcile!" //! ); +//! # } //! ``` //! //! ## Error handling //! //! The library is designed to be robust and will always produce a result, even -//! in edge cases. However, be aware that extremely large diffs may have -//! performance implications. +//! for edge cases. +//! +//! ## Performance +//! +//! Be aware that extremely large diffs may have performance implications. //! //! ## Algorithm overview //! @@ -211,13 +215,12 @@ mod tokenizer; mod types; mod utils; -pub use operation_transformation::{ChangeSet, EditedText, reconcile}; +pub use operation_transformation::{EditedText, reconcile}; pub use tokenizer::{BuiltinTokenizer, Tokenizer, token::Token}; pub use types::{ - cursor_position::CursorPosition, history::History, side::Side, - span_with_history::SpanWithHistory, text_with_cursors::TextWithCursors, + cursor_position::CursorPosition, history::History, number_or_string::NumberOrString, + side::Side, span_with_history::SpanWithHistory, text_with_cursors::TextWithCursors, }; -pub use utils::is_binary::is_binary; #[cfg(feature = "wasm")] pub mod wasm; diff --git a/src/operation_transformation.rs b/src/operation_transformation.rs index e1f173a..85e3995 100644 --- a/src/operation_transformation.rs +++ b/src/operation_transformation.rs @@ -1,12 +1,10 @@ mod edited_text; mod operation; -mod transport; mod utils; use std::fmt::Debug; pub use edited_text::EditedText; pub use operation::Operation; -pub use transport::ChangeSet; use crate::{Tokenizer, types::text_with_cursors::TextWithCursors}; diff --git a/src/operation_transformation/edited_text.rs b/src/operation_transformation/edited_text.rs index a6465fe..f27fea4 100644 --- a/src/operation_transformation/edited_text.rs +++ b/src/operation_transformation/edited_text.rs @@ -4,15 +4,17 @@ use std::{fmt::Debug, vec}; use serde::{Deserialize, Serialize}; use crate::{ - BuiltinTokenizer, ChangeSet, CursorPosition, TextWithCursors, + BuiltinTokenizer, CursorPosition, TextWithCursors, operation_transformation::{ Operation, - transport::SimpleOperation, utils::{cook_operations::cook_operations, elongate_operations::elongate_operations}, }, raw_operation::RawOperation, tokenizer::Tokenizer, - types::{history::History, side::Side, span_with_history::SpanWithHistory}, + types::{ + history::History, number_or_string::NumberOrString, side::Side, + span_with_history::SpanWithHistory, + }, utils::string_builder::StringBuilder, }; @@ -105,6 +107,11 @@ where /// from the same original text. The operations are merged using the /// principles of Operational Transformation. The cursors are updated /// accordingly to reflect the changes made by the merged operations. + /// + /// # Panics + /// + /// Panics if there's an integer overflow (in i64) when calculating new + /// cursor positions. #[must_use] #[allow(clippy::too_many_lines)] pub fn merge(self, other: Self) -> Self { @@ -166,13 +173,14 @@ where let result = operation.merge_operations(&mut last_other_op); if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result { - let merged_length_signed = - isize::try_from(merged_length).unwrap_or(isize::MAX); - let seen_left_length_signed = - isize::try_from(seen_left_length).unwrap_or(isize::MAX); - let op_len_signed = isize::try_from(op.len()).unwrap_or(isize::MAX); - let original_length_signed = - isize::try_from(original_length).unwrap_or(isize::MAX); + let merged_length_signed = isize::try_from(merged_length) + .expect("merged_length must fit in isize"); + let seen_left_length_signed = isize::try_from(seen_left_length) + .expect("seen_left_length must fit in isize"); + let op_len_signed = + isize::try_from(op.len()).expect("op.len() must fit in isize"); + let original_length_signed = isize::try_from(original_length) + .expect("original_length must fit in isize"); let shift = merged_length_signed - seen_left_length_signed + op_len_signed - original_length_signed; @@ -199,13 +207,14 @@ where let result = operation.merge_operations(&mut last_other_op); if let ref op @ (Operation::Insert { .. } | Operation::Equal { .. }) = result { - let merged_length_signed = - isize::try_from(merged_length).unwrap_or(isize::MAX); - let seen_right_length_signed = - isize::try_from(seen_right_length).unwrap_or(isize::MAX); - let op_len_signed = isize::try_from(op.len()).unwrap_or(isize::MAX); - let original_length_signed = - isize::try_from(original_length).unwrap_or(isize::MAX); + let merged_length_signed = isize::try_from(merged_length) + .expect("merged_length must fit in isize"); + let seen_right_length_signed = isize::try_from(seen_right_length) + .expect("seen_right_length must fit in isize"); + let op_len_signed = + isize::try_from(op.len()).expect("op.len() must fit in isize"); + let original_length_signed = isize::try_from(original_length) + .expect("original_length must fit in isize"); let shift = merged_length_signed - seen_right_length_signed + op_len_signed - original_length_signed; @@ -345,34 +354,122 @@ where history } - /// Serialize the `EditedText` as a `ChangeSet`, which contains only - /// the operations and cursor positions, but without the original text. - /// This is useful for sending changes over the network if there's - /// a clear consensus on the original text. + /// Convert the `EditedText` into a terse representation ready for + /// serialization. The result omits cursor positions and the original text. + /// This is useful for sending text diffs over the network if there's a + /// clear consensus on the original text. + /// + /// Inserts are represented as strings, deletes as negative integers, + /// and equal spans as positive integers. + /// + /// # Panics + /// + /// Panics if there's an integer overflow in i64. #[must_use] - pub fn to_change_set(&self) -> ChangeSet { - ChangeSet::new( - SimpleOperation::from_operations(&self.operations), - self.cursors.clone(), - ) + pub fn to_diff(&self) -> Vec { + let mut result: Vec = Vec::with_capacity(self.operations.len()); + let mut previous_equal: Option = None; + + for operation in &self.operations { + match operation { + Operation::Equal { length, .. } => { + if let Some(prev_length) = previous_equal { + previous_equal = Some(prev_length + *length); + } else { + previous_equal = Some(*length); + } + } + + Operation::Insert { text, .. } => { + if let Some(prev_length) = previous_equal { + result.push(NumberOrString::Number( + i64::try_from(prev_length).expect("prev_length must fit in i64"), + )); + previous_equal = None; + } + + let text: String = text + .iter() + .map(super::super::tokenizer::token::Token::original) + .collect(); + result.push(NumberOrString::Text(text)); + } + + Operation::Delete { + deleted_character_count, + .. + } => { + if let Some(prev_length) = previous_equal { + result.push(NumberOrString::Number( + i64::try_from(prev_length).expect("prev_length must fit in i64"), + )); + previous_equal = None; + } + + let count = i64::try_from(*deleted_character_count) + .expect("deleted_character_count must fit in i64"); + result.push(NumberOrString::Number(-count)); + } + } + } + + if let Some(prev_length) = previous_equal { + result.push(NumberOrString::Number( + i64::try_from(prev_length).expect("prev_length must fit in i64"), + )); + } + + result } - /// Deserialize an `EditedText` from a `ChangeSet` and the original text. - /// This is useful for reconstructing the `EditedText` on the receiving - /// end after sending only the `ChangeSet` over the network. + /// Deserialize an `EditedText` from a change list and the original text. + /// + /// # Panics + /// + /// Panics if there's an integer overflow in i64. #[must_use] - pub fn from_change_set( - text: &'a str, - change_set: ChangeSet, + pub fn from_diff( + original_text: &'a str, + diff: Vec, tokenizer: &Tokenizer, ) -> EditedText<'a, T> { - let operations = SimpleOperation::to_operations(change_set.operations, text, tokenizer); + let mut operations: Vec> = Vec::with_capacity(diff.len()); + let mut order = 0; + + for item in diff { + match item { + NumberOrString::Number(length) => { + if length >= 0 { + let length = usize::try_from(length).expect("length must fit in usize"); + let original_characters: String = + original_text.chars().skip(order).take(length).collect(); + + let original_tokens = tokenizer(&original_characters); + for token in original_tokens { + operations + .push(Operation::create_equal(order, token.get_original_length())); + order += token.get_original_length(); + } + } else { + let length = + usize::try_from(-length).expect("negative length must fit in usize"); + operations.push(Operation::create_delete(order, length)); + order += length; + } + } + NumberOrString::Text(text) => { + let tokens = tokenizer(&text); + operations.push(Operation::create_insert(order, tokens)); + } + } + } + let operation_count = operations.len(); EditedText::new( - text, + original_text, operations, vec![Side::Left; operation_count], - change_set.cursors, + vec![], ) } } @@ -423,34 +520,29 @@ mod tests { assert_eq!(operations.apply().text(), expected); } + #[cfg(feature = "serde")] #[test] - fn test_change_set_deserialisation() { + fn test_changes_deserialisation() { let original = "Merging text is hard!"; let changes = "Merging text is easy with reconcile!"; let result = EditedText::from_strings(original, &changes.into()); - let serialized = serde_yaml::to_string(&result.to_change_set()).unwrap(); - - let expected = concat!( - "operations:\n", - "- 15\n", - "- -6\n", - "- ' easy with reconcile!'\n", - "cursors: []\n" - ); + let serialized = serde_yaml::to_string(&result.to_diff()).unwrap(); + let expected = concat!("- 15\n", "- -6\n", "- ' easy with reconcile!'\n",); assert_eq!(serialized, expected); } + #[cfg(feature = "serde")] #[test] - fn test_change_set_serialization() { + fn test_changes_serialization() { let original = "The quick brown fox jumps over the lazy dog."; let updated = "The quick red fox jumped over the very lazy dog!"; let edited_text = EditedText::from_strings(original, &updated.into()); - let change_set = edited_text.to_change_set(); + let changes = edited_text.to_diff(); let deserialized_edited_text = - EditedText::from_change_set(original, change_set, &*BuiltinTokenizer::Word); + EditedText::from_diff(original, changes, &*BuiltinTokenizer::Word); assert_eq!(deserialized_edited_text.apply().text(), updated); } diff --git a/src/operation_transformation/transport.rs b/src/operation_transformation/transport.rs deleted file mode 100644 index 67c25e5..0000000 --- a/src/operation_transformation/transport.rs +++ /dev/null @@ -1,204 +0,0 @@ -use std::fmt::Debug; - -#[cfg(feature = "serde")] -use serde::{ - Deserialize, Serialize, - de::{self, Deserializer, Visitor}, - ser::Serializer, -}; - -use crate::{CursorPosition, Tokenizer, operation_transformation::Operation}; - -#[derive(Clone, PartialEq, Eq, Debug)] -pub enum SimpleOperation { - Equal { length: usize }, - Insert { text: String }, - Delete { length: usize }, -} - -impl SimpleOperation { - pub fn from_operations(operation: &Vec>) -> Vec - where - T: PartialEq + Clone + Debug, - { - let mut result: Vec = Vec::with_capacity(operation.len()); - let mut previous_equal: Option = None; - - for operation in operation { - match operation { - Operation::Equal { length, .. } => { - if let Some(prev_length) = previous_equal { - previous_equal = Some(prev_length + *length); - } else { - previous_equal = Some(*length); - } - } - - Operation::Insert { text, .. } => { - if let Some(prev_length) = previous_equal { - result.push(SimpleOperation::Equal { - length: prev_length, - }); - previous_equal = None; - } - - let text: String = text - .iter() - .map(super::super::tokenizer::token::Token::original) - .collect(); - result.push(SimpleOperation::Insert { text }); - } - - Operation::Delete { - deleted_character_count, - .. - } => { - if let Some(prev_length) = previous_equal { - result.push(SimpleOperation::Equal { - length: prev_length, - }); - previous_equal = None; - } - - result.push(SimpleOperation::Delete { - length: *deleted_character_count, - }); - } - } - } - - if let Some(prev_length) = previous_equal { - result.push(SimpleOperation::Equal { - length: prev_length, - }); - } - - result - } - - // This is similar to `crate::operation_transformation::utils::cook_operations` - pub fn to_operations( - simple_operations: Vec, - original_text: &str, - tokenizer: &Tokenizer, - ) -> Vec> - where - T: PartialEq + Clone + Debug, - { - let mut operations: Vec> = Vec::with_capacity(simple_operations.len()); - let mut order = 0; - - for simple_operation in simple_operations { - match simple_operation { - SimpleOperation::Equal { length } => { - let original_characters: String = - original_text.chars().skip(order).take(length).collect(); - - let original_tokens = tokenizer(&original_characters); - for token in original_tokens { - operations - .push(Operation::create_equal(order, token.get_original_length())); - order += token.get_original_length(); - } - } - - SimpleOperation::Insert { text } => { - let tokens = tokenizer(&text); - operations.push(Operation::create_insert(order, tokens)); - } - - SimpleOperation::Delete { length } => { - operations.push(Operation::create_delete(order, length)); - order += length; - } - } - } - - operations - } -} - -#[cfg(feature = "serde")] -impl Serialize for SimpleOperation { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - // neat idea from https://github.com/spebern/operational-transform-rs/blob/9faa17f0a2b282ac2e09dbb2d29fdaf2ae0bbb4a/operational-transform/src/serde.rs#L14 - match self { - SimpleOperation::Equal { length } => serializer.serialize_u64(*length as u64), - SimpleOperation::Insert { text } => serializer.serialize_str(text), - SimpleOperation::Delete { length } => { - serializer.serialize_i64(-(i64::try_from(*length).unwrap_or(i64::MAX))) - } - } - } -} - -#[cfg(feature = "serde")] -impl<'de> Deserialize<'de> for SimpleOperation { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - use std::fmt; - - struct OperationVisitor; - - impl Visitor<'_> for OperationVisitor { - type Value = SimpleOperation; - - fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - formatter.write_str("an integer between -2^63 and 2^64-1 or a string") - } - - fn visit_u64(self, value: u64) -> Result - where - E: de::Error, - { - Ok(SimpleOperation::Equal { - length: usize::try_from(value).unwrap_or(usize::MAX), - }) - } - - fn visit_i64(self, value: i64) -> Result - where - E: de::Error, - { - Ok(SimpleOperation::Delete { - length: usize::try_from(-value).unwrap_or(usize::MAX), - }) - } - - fn visit_str(self, value: &str) -> Result - where - E: de::Error, - { - Ok(SimpleOperation::Insert { - text: value.to_owned(), - }) - } - } - - deserializer.deserialize_any(OperationVisitor) - } -} - -/// A serializable representation of the changes made to a text document -/// without the original text. -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, PartialEq, Default)] -pub struct ChangeSet { - pub operations: Vec, - pub cursors: Vec, -} - -impl ChangeSet { - #[must_use] - pub fn new(operations: Vec, cursors: Vec) -> Self { - Self { - operations, - cursors, - } - } -} diff --git a/src/types.rs b/src/types.rs index b32ef9a..b5c2f7c 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,5 +1,6 @@ pub mod cursor_position; pub mod history; +pub mod number_or_string; pub mod side; pub mod span_with_history; pub mod text_with_cursors; diff --git a/src/types/number_or_string.rs b/src/types/number_or_string.rs new file mode 100644 index 0000000..7272a60 --- /dev/null +++ b/src/types/number_or_string.rs @@ -0,0 +1,74 @@ +use std::fmt::Debug; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; +#[cfg(feature = "wasm")] +use wasm_bindgen::prelude::*; + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serde", serde(untagged))] +#[derive(Debug, Clone, PartialEq)] +pub enum NumberOrString { + Number(i64), + Text(String), +} + +#[cfg(feature = "wasm")] +impl TryFrom for NumberOrString { + type Error = DeserialisationError; + + fn try_from(value: JsValue) -> Result { + if let Ok(num) = value.clone().try_into() { + return Ok(NumberOrString::Number(num)); + } + + if let Ok(text) = value.try_into() { + return Ok(NumberOrString::Text(text)); + } + + Err(DeserialisationError::new( + "Could not parse JsValue as either number or string", + )) + } +} + +#[cfg(feature = "wasm")] +impl From for JsValue { + fn from(value: NumberOrString) -> Self { + match value { + NumberOrString::Number(num) => JsValue::from(num), + NumberOrString::Text(text) => JsValue::from(text), + } + } +} + +/// Error type for deserialisation failures +#[cfg(feature = "wasm")] +#[derive(Debug, Clone)] +pub struct DeserialisationError { + pub message: String, +} + +#[cfg(feature = "wasm")] +impl DeserialisationError { + pub fn new(message: impl Into) -> Self { + Self { + message: message.into(), + } + } +} + +#[cfg(feature = "wasm")] +impl std::fmt::Display for DeserialisationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Deserialisation error: {}", self.message) + } +} + +#[cfg(feature = "wasm")] +impl std::error::Error for DeserialisationError {} + +#[cfg(feature = "wasm")] +impl From for JsValue { + fn from(error: DeserialisationError) -> Self { JsValue::from_str(&error.message) } +} diff --git a/src/utils.rs b/src/utils.rs index f249825..2e05a70 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,6 +1,5 @@ pub mod common_prefix_len; pub mod common_suffix_len; pub mod find_longest_prefix_contained_within; -pub mod is_binary; pub mod myers_diff; pub mod string_builder; diff --git a/src/utils/is_binary.rs b/src/utils/is_binary.rs deleted file mode 100644 index 09bfcf9..0000000 --- a/src/utils/is_binary.rs +++ /dev/null @@ -1,26 +0,0 @@ -/// Heuristically determine if the given data is a binary or a text file's -/// content. -/// -/// Only text inputs can be reconciled using the crate's functions. -#[must_use] -pub fn is_binary(data: &[u8]) -> bool { - if data.contains(&0) { - // Even though the NUL character is valid in UTF-8, it's highly suspicious in - // human-readable text. - return true; - } - - std::str::from_utf8(data).is_err() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_binary() { - assert!(is_binary(&[0, 159, 146, 150])); - assert!(is_binary(&[0, 12])); - assert!(!is_binary(b"hello")); - } -} diff --git a/src/utils/myers_diff.rs b/src/utils/myers_diff.rs index 776e448..c89788d 100644 --- a/src/utils/myers_diff.rs +++ b/src/utils/myers_diff.rs @@ -87,7 +87,7 @@ struct V { impl V { fn new(max_d: usize) -> Self { // max_d should fit in isize for the algorithm to work correctly - let offset = isize::try_from(max_d).unwrap_or(isize::MAX); + let offset = isize::try_from(max_d).expect("max_d must fit in isize"); Self { offset, v: vec![0; 2 * max_d], @@ -101,16 +101,15 @@ impl Index for V { type Output = usize; fn index(&self, index: isize) -> &Self::Output { - let idx = usize::try_from(index + self.offset).unwrap_or(usize::MAX); - &self.v[idx.min(self.v.len().saturating_sub(1))] + let idx = usize::try_from(index + self.offset).expect("index + offset must fit in usize"); + &self.v[idx] } } impl IndexMut for V { fn index_mut(&mut self, index: isize) -> &mut Self::Output { - let idx = usize::try_from(index + self.offset).unwrap_or(usize::MAX); - let len = self.v.len(); - &mut self.v[idx.min(len.saturating_sub(1))] + let idx = usize::try_from(index + self.offset).expect("index + offset must fit in usize"); + &mut self.v[idx] } } @@ -145,7 +144,8 @@ where // By Lemma 1 in the paper, the optimal edit script length is odd or even as // `delta` is odd or even. - let delta = isize::try_from(n).unwrap_or(isize::MAX) - isize::try_from(m).unwrap_or(isize::MAX); + let delta = isize::try_from(n).expect("n must fit in isize") + - isize::try_from(m).expect("m must fit in isize"); let odd = delta & 1 == 1; // The initial point at (0, -1) @@ -157,7 +157,7 @@ where assert!(vf.len() >= d_max); assert!(vb.len() >= d_max); - let d_max_isize = isize::try_from(d_max).unwrap_or(isize::MAX); + let d_max_isize = isize::try_from(d_max).expect("d_max must fit in isize"); for d in 0..d_max_isize { // Forward path for k in (-d..=d).rev().step_by(2) { @@ -166,7 +166,8 @@ where } else { vf[k - 1] + 1 }; - let y = usize::try_from(isize::try_from(x).unwrap_or(isize::MAX) - k).unwrap_or(0); + let y = usize::try_from(isize::try_from(x).expect("x must fit in isize") - k) + .expect("x - k must be non-negative and fit in usize"); // The coordinate of the start of a snake let (x0, y0) = (x, y); @@ -204,7 +205,8 @@ where } else { vb[k - 1] + 1 }; - let mut y = usize::try_from(isize::try_from(x).unwrap_or(isize::MAX) - k).unwrap_or(0); + let mut y = usize::try_from(isize::try_from(x).expect("x must fit in isize") - k) + .expect("x - k must be non-negative and fit in usize"); // The coordinate of the start of a snake if x < n && y < m { diff --git a/src/wasm.rs b/src/wasm.rs index 0fd0aca..1b7a24b 100644 --- a/src/wasm.rs +++ b/src/wasm.rs @@ -3,7 +3,7 @@ use core::str; use wasm_bindgen::prelude::*; -use crate::{BuiltinTokenizer, CursorPosition, SpanWithHistory, TextWithCursors}; +use crate::{BuiltinTokenizer, CursorPosition, EditedText, SpanWithHistory, TextWithCursors}; #[global_allocator] static ALLOC: wee_alloc::WeeAlloc<'_> = wee_alloc::WeeAlloc::INIT; @@ -32,6 +32,7 @@ pub fn reconcile_with_history( tokenizer: BuiltinTokenizer, ) -> TextWithCursorsAndHistory { set_panic_hook(); + let reconciled = crate::reconcile(parent, left, right, &*tokenizer); let text_with_cursors = reconciled.apply(); @@ -54,10 +55,6 @@ pub fn reconcile_with_history( /// # Returns /// /// The merged document. -/// -/// # Panics -/// -/// If any of the input documents are not valid UTF-8 strings. #[wasm_bindgen(js_name = genericReconcile)] #[must_use] pub fn generic_reconcile( @@ -68,51 +65,56 @@ pub fn generic_reconcile( ) -> Vec { set_panic_hook(); - if crate::is_binary(parent) || crate::is_binary(left) || crate::is_binary(right) { - right.to_vec() + if let (Some(parent), Some(left), Some(right)) = ( + string_or_nothing(parent), + string_or_nothing(left), + string_or_nothing(right), + ) { + crate::reconcile(&parent, &left.into(), &right.into(), &*tokenizer) + .apply() + .text() + .into_bytes() } else { - crate::reconcile( - str::from_utf8(parent).expect("parent must be valid UTF-8 because it's not binary"), - &str::from_utf8(left) - .expect("left must be valid UTF-8 because it's not binary") - .into(), - &str::from_utf8(right) - .expect("right must be valid UTF-8 because it's not binary") - .into(), - &*tokenizer, - ) - .apply() - .text() - .into_bytes() + right.to_vec() } } -/// WASM wrapper around getting a compact diff representation as a JSON string +/// WASM wrapper around getting a compact diff representation of two texts as a +/// list of numbers and strings. +#[wasm_bindgen(js_name = diff)] +#[must_use] +pub fn diff(parent: &str, changed: &TextWithCursors, tokenizer: BuiltinTokenizer) -> Vec { + set_panic_hook(); + + let edited_text = EditedText::from_strings_with_tokenizer(parent, changed, &*tokenizer); + edited_text + .to_diff() + .into_iter() + .map(std::convert::Into::into) + .collect() +} + +/// Inverse of `diff`, applies a compact diff representation to a parent text /// /// # Panics /// -/// If serialization to JSON fails which should not happen -#[wasm_bindgen(js_name = getCompactDiff)] +/// Panics if the diff format is invalid or there's an integer overflow when +/// applying the diff. +#[wasm_bindgen(js_name = undiff)] #[must_use] -pub fn get_compact_diff( - parent: &str, - changed: &TextWithCursors, - tokenizer: BuiltinTokenizer, -) -> String { +pub fn undiff(parent: &str, diff: Vec, tokenizer: BuiltinTokenizer) -> String { set_panic_hook(); - let edited_text = crate::EditedText::from_strings_with_tokenizer(parent, changed, &*tokenizer); - let change_set = edited_text.to_change_set(); - serde_json::to_string(&change_set).expect("Failed to serialize change set") -} - -/// Heuristically determine if the given data is a binary or a text file's -/// content. -#[wasm_bindgen(js_name = isBinary)] -#[must_use] -pub fn is_binary(data: &[u8]) -> bool { - set_panic_hook(); - crate::is_binary(data) + EditedText::from_diff( + parent, + diff.into_iter() + .map(std::convert::TryInto::try_into) + .collect::>() + .expect("Invalid diff format"), + &*tokenizer, + ) + .apply() + .text() } fn set_panic_hook() { @@ -140,3 +142,30 @@ impl TextWithCursorsAndHistory { #[must_use] pub fn history(&self) -> Vec { self.history.clone() } } + +/// Returns the UTF8 parsed string if it's a text, or `None` if it's likely +/// binary. +#[must_use] +fn string_or_nothing(data: &[u8]) -> Option { + if data.contains(&0) { + // Even though the NUL character is valid in UTF-8, it's highly suspicious in + // human-readable text. + return None; + } + + std::str::from_utf8(data) + .map(std::borrow::ToOwned::to_owned) + .ok() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_string_or_nothing() { + assert_eq!(string_or_nothing(&[0, 159, 146, 150]), None); + assert_eq!(string_or_nothing(&[0, 12]), None); + assert_eq!(string_or_nothing(b"hello"), Some("hello".into())); + } +} diff --git a/tests/test.rs b/tests/test.rs index e8fae7d..2b14b86 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -3,7 +3,7 @@ mod example_document; use std::{fs, path::Path}; use example_document::ExampleDocument; -use reconcile_text::{BuiltinTokenizer, EditedText, reconcile}; +use reconcile_text::{BuiltinTokenizer, reconcile}; use serde::Deserialize; #[test] @@ -34,8 +34,11 @@ fn test_document_one_way_with_cursors() { } } +#[cfg(feature = "serde")] #[test] -fn test_document_one_way_with_cursors_and_serialisation() { +fn test_document_one_way_with_serialisation() { + use reconcile_text::EditedText; + for doc in &get_all_documents() { let parent = doc.parent(); let left_operations = @@ -47,19 +50,23 @@ fn test_document_one_way_with_cursors_and_serialisation() { ); let serialised_left = - serde_yaml::from_str(&serde_yaml::to_string(&left_operations.to_change_set()).unwrap()) + serde_yaml::from_str(&serde_yaml::to_string(&left_operations.to_diff()).unwrap()) + .unwrap(); + let serialised_right = + serde_yaml::from_str(&serde_yaml::to_string(&right_operations.to_diff()).unwrap()) .unwrap(); - let serialised_right = serde_yaml::from_str( - &serde_yaml::to_string(&right_operations.to_change_set()).unwrap(), - ) - .unwrap(); let restored_left_operations = - EditedText::from_change_set(&parent, serialised_left, &*BuiltinTokenizer::Word); + EditedText::from_diff(&parent, serialised_left, &*BuiltinTokenizer::Word); let restored_right_operations = - EditedText::from_change_set(&parent, serialised_right, &*BuiltinTokenizer::Word); + EditedText::from_diff(&parent, serialised_right, &*BuiltinTokenizer::Word); - doc.assert_eq(&restored_left_operations.merge(restored_right_operations)); + doc.assert_eq_without_cursors( + &restored_left_operations + .merge(restored_right_operations) + .apply() + .text(), + ); } } diff --git a/tests/wasm.rs b/tests/wasm.rs index 6a9d556..304ee6e 100644 --- a/tests/wasm.rs +++ b/tests/wasm.rs @@ -55,22 +55,16 @@ fn test_merge_binary() { ); } -#[wasm_bindgen_test(unsupported = test)] -fn test_is_binary() { - assert!(is_binary(&[0, 159, 146, 150])); - assert!(is_binary(&[0, 12])); - assert!(!is_binary(b"hello")); -} - -#[wasm_bindgen_test(unsupported = test)] -fn test_get_compact_diff() { +#[wasm_bindgen_test] // JsValue isn't supported outside of wasm +fn test_diff() { let parent = "hello "; let changed = "world"; - let result = get_compact_diff(parent, &changed.into(), BuiltinTokenizer::Word); - assert_eq!(result, "{\"operations\":[-6,\"world\"],\"cursors\":[]}"); -} -#[wasm_bindgen_test(unsupported = test)] -fn test_is_binary_empty() { - assert!(!is_binary(b"")); + let result = diff(parent, &changed.into(), BuiltinTokenizer::Word); + + assert_eq!(result.len(), 2); + let first: i64 = result[0].clone().try_into().unwrap(); + let second: String = result[1].clone().try_into().unwrap(); + assert_eq!(first, -6); + assert_eq!(second, "world"); }