Fix slow commit bug

This commit is contained in:
Andras Schmelczer 2026-05-10 15:08:40 +01:00
parent eb23f445d0
commit 0329fc29f2
7 changed files with 88 additions and 40 deletions

View file

@ -103,6 +103,7 @@ import { renamedPendingCreateReusedPathThenDeleteTest } from "./tests/renamed-pe
import { renamePendingCreateOntoPendingDeletePathTest } from "./tests/rename-pending-create-onto-pending-delete-path.test";
import { remoteQuickWriteRenameBeforeRecordTest } from "./tests/remote-quick-write-rename-before-record.test";
import { selfMergePendingRenameAliasesSecondCreateTest } from "./tests/self-merge-pending-rename-aliases-second-create.test";
import { disableMidCreateThenDeleteTest } from "./tests/disable-mid-create-then-delete.test";
export const TESTS: Partial<Record<string, TestDefinition>> = {
"rename-create-conflict": renameCreateConflictTest,
@ -239,5 +240,6 @@ export const TESTS: Partial<Record<string, TestDefinition>> = {
"remote-quick-write-rename-before-record":
remoteQuickWriteRenameBeforeRecordTest,
"self-merge-pending-rename-aliases-second-create":
selfMergePendingRenameAliasesSecondCreateTest
selfMergePendingRenameAliasesSecondCreateTest,
"disable-mid-create-then-delete": disableMidCreateThenDeleteTest
};

View file

@ -0,0 +1,56 @@
import type { AssertableState } from "../utils/assertable-state";
import type { TestDefinition } from "../test-definition";
export const disableMidCreateThenDeleteTest: TestDefinition = {
description:
"Reproduces a fuzz failure where one client's create-then-delete-then-disable-sync " +
"sequence loses the file: the create commits server-side, the response is " +
"lost (sync reset), the local file is deleted, then sync is re-enabled. The " +
"catch-up replay should redeliver the create so both clients converge to " +
"having the file (the delete never reached the server because its docId " +
"Promise was rejected when the queue cleared).",
clients: 2,
steps: [
// Client 0 is online (the witness); client 1 starts disabled.
{ type: "enable-sync", client: 0 },
// Client 1 creates the file while offline so the LocalCreate is queued.
{ type: "create", client: 1, path: "file-32.md", content: "hello" },
// Arm the drop so client 1's create POST commits server-side but the
// response is replaced with SyncResetError (matches the fuzz scenario
// where sync was disabled mid-flight).
{ type: "drop-next-create-response", client: 1 },
// Enable sync on client 1: offline scan picks up file-32, drain fires
// POST /documents, server commits, broadcast goes out, response is
// dropped on the client. SyncResetError exits the drain leaving the
// create event still in the queue.
{ type: "enable-sync", client: 1 },
{ type: "wait-for-dropped-create-response", client: 1 },
// The user then deletes the file locally and toggles sync off/on
// (the same flow the fuzz harness used). The disable's pause()
// does not clear the queue, but the re-enable runs an offline
// scan that calls clearPending() — wiping the dangling LocalCreate
// and any LocalDelete behind it. The local disk is empty, so
// nothing is enqueued.
{ type: "delete", client: 1, path: "file-32.md" },
{ type: "disable-sync", client: 1 },
{ type: "enable-sync", client: 1 },
// Catch-up on the new WS connection should deliver file-32 (vault
// update id 1) since client 1's lastSeenUpdateId is still 0.
{ type: "barrier" },
{
type: "assert-consistent",
verify: (s: AssertableState): void => {
s.assertFileExists("file-32.md").assertContent(
"file-32.md",
"hello"
);
}
}
]
};

View file

@ -1,4 +1,4 @@
#!/bin/bash
rm -rf /host/tmp/vaultlink-e2e-databases
rm -rf /tmp/vaultlink-e2e-databases
rm -rf logs

View file

@ -31,7 +31,7 @@ sleep 1
# Clean databases (uses tmpfs via /dev/shm for zero disk I/O)
echo "Cleaning databases..."
rm -rf /host/tmp/vaultlink-e2e-databases
rm -rf /tmp/databases
# Start the server in the background
echo "Starting server..."

View file

@ -793,23 +793,18 @@ impl Database {
.await
.context("Failed to commit transaction")?;
// For non-delete writes the originating device already has
// authoritative state from its HTTP response, so we tag the
// broadcast with `origin_device_id` and the send task in
// `websocket.rs` filters it out for that device. Deletes are
// delivered to *every* connected client including the author —
// the originator only removes the document from its sync queue
// once it receives this receipt.
// Broadcast every commit to every connected client, including
// the originator. The HTTP response is the originator's normal
// path to learn its own update, but if the response is lost
// (sync reset, dropped TCP) the broadcast is the only remaining
// delivery channel — and the client-side `parentVersionId`
// dedup absorbs the redundant message when the response made it
// through.
let envelope = WebSocketServerMessage::VaultUpdate(WebSocketVaultUpdate {
document: version.clone().into(),
});
let with_origin = if version.is_deleted {
WebSocketServerMessageWithOrigin::new(envelope)
} else {
WebSocketServerMessageWithOrigin::with_origin(version.device_id.clone(), envelope)
};
self.broadcasts
.send_document_update(vault_id, with_origin)?;
.send_document_update(vault_id, WebSocketServerMessageWithOrigin::new(envelope))?;
Ok(())
}

View file

@ -80,28 +80,13 @@ pub enum WebSocketServerMessage {
CursorPositions(CursorPositionFromServer),
}
/// Broadcast envelope carrying the message plus the device that produced
/// it. The per-recipient send task compares `origin_device_id` against
/// its own device id to fill in `originates_from_self` before the message
/// is serialized on the wire.
#[derive(Clone, Debug)]
pub struct WebSocketServerMessageWithOrigin {
pub origin_device_id: Option<DeviceId>,
pub message: WebSocketServerMessage,
}
impl WebSocketServerMessageWithOrigin {
pub fn new(message: WebSocketServerMessage) -> Self {
Self {
origin_device_id: None,
message,
}
}
pub fn with_origin(origin_device_id: DeviceId, message: WebSocketServerMessage) -> Self {
Self {
origin_device_id: Some(origin_device_id),
message,
}
Self { message }
}
}

View file

@ -208,14 +208,24 @@ async fn websocket(
loop {
match broadcast_receiver.recv().await {
Ok(update) => {
// Drop messages this device authored because the HTTP
// response already carried authoritative state back.
// Delete broadcasts are sent without an origin so the
// author also receives them — that's the receipt the
// client needs to drop the doc from its sync queue.
if Some(&device_id) == update.origin_device_id.as_ref() {
continue;
}
// Always deliver vault updates to the originating
// device too. The HTTP response is the *normal* path
// for the originator to learn its own update, and
// the client-side wire loop dedupes redundant
// broadcasts via the `parentVersionId` check. But
// when the response is lost mid-flight (sync reset,
// pause/resume, dropped TCP) the originator has no
// record of the doc; if the broadcast is also
// suppressed AND the next handshake's cursor was
// captured before the commit (cursor < vuid), the
// doc falls through both delivery paths and is
// stranded forever on the originator. Letting the
// self-broadcast through closes that window — the
// message processes as a remote create on the
// originator's reconnected WS and the file is
// restored. (Cursor messages still get the inner
// self-filter below; we drop our own cursor entries
// from the `clients` payload.)
// Filter out vault updates already covered by the
// catch-up snapshot. The handshake atomically