diff --git a/frontend/deterministic-tests/src/test-registry.ts b/frontend/deterministic-tests/src/test-registry.ts index 2ecd7d37..dfe267d2 100644 --- a/frontend/deterministic-tests/src/test-registry.ts +++ b/frontend/deterministic-tests/src/test-registry.ts @@ -103,6 +103,7 @@ import { renamedPendingCreateReusedPathThenDeleteTest } from "./tests/renamed-pe import { renamePendingCreateOntoPendingDeletePathTest } from "./tests/rename-pending-create-onto-pending-delete-path.test"; import { remoteQuickWriteRenameBeforeRecordTest } from "./tests/remote-quick-write-rename-before-record.test"; import { selfMergePendingRenameAliasesSecondCreateTest } from "./tests/self-merge-pending-rename-aliases-second-create.test"; +import { disableMidCreateThenDeleteTest } from "./tests/disable-mid-create-then-delete.test"; export const TESTS: Partial> = { "rename-create-conflict": renameCreateConflictTest, @@ -239,5 +240,6 @@ export const TESTS: Partial> = { "remote-quick-write-rename-before-record": remoteQuickWriteRenameBeforeRecordTest, "self-merge-pending-rename-aliases-second-create": - selfMergePendingRenameAliasesSecondCreateTest + selfMergePendingRenameAliasesSecondCreateTest, + "disable-mid-create-then-delete": disableMidCreateThenDeleteTest }; diff --git a/frontend/deterministic-tests/src/tests/disable-mid-create-then-delete.test.ts b/frontend/deterministic-tests/src/tests/disable-mid-create-then-delete.test.ts new file mode 100644 index 00000000..5ff1d529 --- /dev/null +++ b/frontend/deterministic-tests/src/tests/disable-mid-create-then-delete.test.ts @@ -0,0 +1,56 @@ +import type { AssertableState } from "../utils/assertable-state"; +import type { TestDefinition } from "../test-definition"; + +export const disableMidCreateThenDeleteTest: TestDefinition = { + description: + "Reproduces a fuzz failure where one client's create-then-delete-then-disable-sync " + + "sequence loses the file: the create commits server-side, the response is " + + "lost (sync reset), the local file is deleted, then sync is re-enabled. The " + + "catch-up replay should redeliver the create so both clients converge to " + + "having the file (the delete never reached the server because its docId " + + "Promise was rejected when the queue cleared).", + clients: 2, + steps: [ + // Client 0 is online (the witness); client 1 starts disabled. + { type: "enable-sync", client: 0 }, + + // Client 1 creates the file while offline so the LocalCreate is queued. + { type: "create", client: 1, path: "file-32.md", content: "hello" }, + + // Arm the drop so client 1's create POST commits server-side but the + // response is replaced with SyncResetError (matches the fuzz scenario + // where sync was disabled mid-flight). + { type: "drop-next-create-response", client: 1 }, + + // Enable sync on client 1: offline scan picks up file-32, drain fires + // POST /documents, server commits, broadcast goes out, response is + // dropped on the client. SyncResetError exits the drain leaving the + // create event still in the queue. + { type: "enable-sync", client: 1 }, + { type: "wait-for-dropped-create-response", client: 1 }, + + // The user then deletes the file locally and toggles sync off/on + // (the same flow the fuzz harness used). The disable's pause() + // does not clear the queue, but the re-enable runs an offline + // scan that calls clearPending() — wiping the dangling LocalCreate + // and any LocalDelete behind it. The local disk is empty, so + // nothing is enqueued. + { type: "delete", client: 1, path: "file-32.md" }, + { type: "disable-sync", client: 1 }, + { type: "enable-sync", client: 1 }, + + // Catch-up on the new WS connection should deliver file-32 (vault + // update id 1) since client 1's lastSeenUpdateId is still 0. + { type: "barrier" }, + + { + type: "assert-consistent", + verify: (s: AssertableState): void => { + s.assertFileExists("file-32.md").assertContent( + "file-32.md", + "hello" + ); + } + } + ] +}; diff --git a/scripts/clean-up.sh b/scripts/clean-up.sh index dcf400bb..267a1019 100755 --- a/scripts/clean-up.sh +++ b/scripts/clean-up.sh @@ -1,4 +1,4 @@ #!/bin/bash -rm -rf /host/tmp/vaultlink-e2e-databases +rm -rf /tmp/vaultlink-e2e-databases rm -rf logs diff --git a/scripts/e2e.sh b/scripts/e2e.sh index 7ab8d90c..abc3dcd2 100755 --- a/scripts/e2e.sh +++ b/scripts/e2e.sh @@ -31,7 +31,7 @@ sleep 1 # Clean databases (uses tmpfs via /dev/shm for zero disk I/O) echo "Cleaning databases..." -rm -rf /host/tmp/vaultlink-e2e-databases +rm -rf /tmp/databases # Start the server in the background echo "Starting server..." diff --git a/sync-server/src/app_state/database.rs b/sync-server/src/app_state/database.rs index c9122538..1a2483a2 100644 --- a/sync-server/src/app_state/database.rs +++ b/sync-server/src/app_state/database.rs @@ -793,23 +793,18 @@ impl Database { .await .context("Failed to commit transaction")?; - // For non-delete writes the originating device already has - // authoritative state from its HTTP response, so we tag the - // broadcast with `origin_device_id` and the send task in - // `websocket.rs` filters it out for that device. Deletes are - // delivered to *every* connected client including the author — - // the originator only removes the document from its sync queue - // once it receives this receipt. + // Broadcast every commit to every connected client, including + // the originator. The HTTP response is the originator's normal + // path to learn its own update, but if the response is lost + // (sync reset, dropped TCP) the broadcast is the only remaining + // delivery channel — and the client-side `parentVersionId` + // dedup absorbs the redundant message when the response made it + // through. let envelope = WebSocketServerMessage::VaultUpdate(WebSocketVaultUpdate { document: version.clone().into(), }); - let with_origin = if version.is_deleted { - WebSocketServerMessageWithOrigin::new(envelope) - } else { - WebSocketServerMessageWithOrigin::with_origin(version.device_id.clone(), envelope) - }; self.broadcasts - .send_document_update(vault_id, with_origin)?; + .send_document_update(vault_id, WebSocketServerMessageWithOrigin::new(envelope))?; Ok(()) } diff --git a/sync-server/src/app_state/websocket/models.rs b/sync-server/src/app_state/websocket/models.rs index 8a8d42cc..ebe4018b 100644 --- a/sync-server/src/app_state/websocket/models.rs +++ b/sync-server/src/app_state/websocket/models.rs @@ -80,28 +80,13 @@ pub enum WebSocketServerMessage { CursorPositions(CursorPositionFromServer), } -/// Broadcast envelope carrying the message plus the device that produced -/// it. The per-recipient send task compares `origin_device_id` against -/// its own device id to fill in `originates_from_self` before the message -/// is serialized on the wire. #[derive(Clone, Debug)] pub struct WebSocketServerMessageWithOrigin { - pub origin_device_id: Option, pub message: WebSocketServerMessage, } impl WebSocketServerMessageWithOrigin { pub fn new(message: WebSocketServerMessage) -> Self { - Self { - origin_device_id: None, - message, - } - } - - pub fn with_origin(origin_device_id: DeviceId, message: WebSocketServerMessage) -> Self { - Self { - origin_device_id: Some(origin_device_id), - message, - } + Self { message } } } diff --git a/sync-server/src/server/websocket.rs b/sync-server/src/server/websocket.rs index 1bf49dbf..e2db6ca9 100644 --- a/sync-server/src/server/websocket.rs +++ b/sync-server/src/server/websocket.rs @@ -208,14 +208,24 @@ async fn websocket( loop { match broadcast_receiver.recv().await { Ok(update) => { - // Drop messages this device authored because the HTTP - // response already carried authoritative state back. - // Delete broadcasts are sent without an origin so the - // author also receives them — that's the receipt the - // client needs to drop the doc from its sync queue. - if Some(&device_id) == update.origin_device_id.as_ref() { - continue; - } + // Always deliver vault updates to the originating + // device too. The HTTP response is the *normal* path + // for the originator to learn its own update, and + // the client-side wire loop dedupes redundant + // broadcasts via the `parentVersionId` check. But + // when the response is lost mid-flight (sync reset, + // pause/resume, dropped TCP) the originator has no + // record of the doc; if the broadcast is also + // suppressed AND the next handshake's cursor was + // captured before the commit (cursor < vuid), the + // doc falls through both delivery paths and is + // stranded forever on the originator. Letting the + // self-broadcast through closes that window — the + // message processes as a remote create on the + // originator's reconnected WS and the file is + // restored. (Cursor messages still get the inner + // self-filter below; we drop our own cursor entries + // from the `clients` payload.) // Filter out vault updates already covered by the // catch-up snapshot. The handshake atomically