actually works
Some checks failed
Check / build (pull_request) Failing after 7s
E2E tests / build (pull_request) Failing after 6s
Publish CLI / publish-docker (pull_request) Failing after 4m59s
Publish server Docker image / publish-docker (pull_request) Failing after 35m13s

This commit is contained in:
Andras Schmelczer 2026-05-08 16:40:32 +01:00
parent fb71622e40
commit f2337dbbd0
7 changed files with 238 additions and 8 deletions

View file

@ -1 +0,0 @@
{"sessionId":"ba370285-50c8-425b-bbd6-21e8273dd73e","pid":77001,"procStart":"442537909","acquiredAt":1777926837871}

View file

@ -105,6 +105,7 @@ import { queuedCreateDeleteDoesNotHijackReusedPathTest } from "./tests/queued-cr
import { renamedPendingCreateReusedPathThenDeleteTest } from "./tests/renamed-pending-create-reused-path-then-delete.test";
import { renamePendingCreateOntoPendingDeletePathTest } from "./tests/rename-pending-create-onto-pending-delete-path.test";
import { remoteQuickWriteRenameBeforeRecordTest } from "./tests/remote-quick-write-rename-before-record.test";
import { selfMergePendingRenameAliasesSecondCreateTest } from "./tests/self-merge-pending-rename-aliases-second-create.test";
export const TESTS: Partial<Record<string, TestDefinition>> = {
"rename-create-conflict": renameCreateConflictTest,
@ -242,5 +243,7 @@ export const TESTS: Partial<Record<string, TestDefinition>> = {
"queued-create-delete-does-not-hijack-reused-path":
queuedCreateDeleteDoesNotHijackReusedPathTest,
"remote-quick-write-rename-before-record":
remoteQuickWriteRenameBeforeRecordTest
remoteQuickWriteRenameBeforeRecordTest,
"self-merge-pending-rename-aliases-second-create":
selfMergePendingRenameAliasesSecondCreateTest
};

View file

@ -0,0 +1,152 @@
import type { AssertableState } from "../utils/assertable-state";
import type { TestDefinition } from "../test-definition";
export const selfMergePendingRenameAliasesSecondCreateTest: TestDefinition = {
description:
"Single client makes two distinct creates that briefly share a path. " +
"Client 0 POSTs the first create at primary.md while the server is " +
"paused. While that POST is in flight: a second create is queued at " +
"staging.md, primary.md is renamed to moved.md (rewriting the in- " +
"flight create's event.path to moved.md and pushing a rename " +
"LocalUpdate at the queue tail), and staging.md is renamed onto the " +
"now-vacated primary.md slot (rewriting the second create's " +
"event.path to primary.md and pushing another rename LocalUpdate). " +
"Client 0's WS is paused throughout, so its watermark stays at 0. " +
"On resume the first POST commits Doc-X at primary.md (creation_vuid " +
"= N). The drain then processes the second LocalCreate (POST " +
"relativePath=primary.md, last_seen=0); the server's path-based " +
"dedup sees N > 0 and merges the second create into Doc-X " +
"(MergingUpdate). The buggy behaviour: processCreate's resolveCreate " +
"calls upsertRecord with localPath=primary.md, but the existing " +
"record (from the first create) already holds localPath=moved.md, " +
"and upsertRecord's `existing.localPath !== undefined` guard " +
"silently drops the new claim. The file at primary.md is left " +
"orphaned: tracked by no record, never broadcast, never deleted. " +
"After the user's renames the expected user-visible state is two " +
"distinct files at moved.md and primary.md — both clients must " +
"converge to that.",
clients: 2,
steps: [
// Both clients online so the WS connection is established before
// the test starts pausing things.
{ type: "enable-sync", client: 0 },
{ type: "enable-sync", client: 1 },
{ type: "barrier" },
// Pause client 0's WS so its MinCovered watermark stays at 0
// through the whole bug sequence. The merge condition the
// server is going to fire is `creation_vuid > last_seen`; with
// a non-zero gap the same-device second create gets merged
// into the same-device first create.
{ type: "pause-websocket", client: 0 },
// Client 1 commits a doc to push the server's vuid above 0.
// Without this filler, Doc-X's create vuid could be 1 and
// client 0's last_seen.add(1) would advance min to 1, killing
// the watermark gap that triggers the merge.
{
type: "create",
client: 1,
path: "filler.md",
content: "filler-content "
},
{ type: "sync", client: 1 },
// Pause the server so client 0's first create POST hangs in
// flight, giving us a deterministic window in which to enqueue
// the second create and the renames.
{ type: "pause-server" },
// First create — Doc-X. The wire-loop drains it, captures
// requestPath = event.path = "primary.md", reads the bytes,
// sends the POST, and stalls on the response.
{
type: "create",
client: 0,
path: "primary.md",
content: "primary content "
},
// Make sure the POST is actually on the wire with
// relativePath="primary.md" before we rewrite event.path.
// Without this delay the rename can win the race, the POST
// goes out with relativePath="moved.md", and the server-side
// path-collision merge never fires.
{ type: "sleep", ms: 100 },
// Second create at a staging path. The wire-loop is still
// blocked on Doc-X's POST, so this LocalCreate just queues at
// index 1.
{
type: "create",
client: 0,
path: "staging.md",
content: "secondary content "
},
// Rename Doc-X's path. enqueue's pending-create branch
// rewrites Doc-X's event.path in place (moved.md) and pushes
// a LocalUpdate(rename, originalPath=moved.md) at the END of
// the queue. Note the ordering: this LocalUpdate is enqueued
// AFTER the staging LocalCreate above. That ordering is
// load-bearing — it is what makes the second create's POST
// drain (and trigger the server-side merge) before Doc-X's
// rename PUT moves the doc away from primary.md on the
// server.
{
type: "rename",
client: 0,
oldPath: "primary.md",
newPath: "moved.md"
},
// Rename the staging file onto Doc-X's now-vacated primary.md
// slot. enqueue rewrites the staging LocalCreate's event.path
// to primary.md and pushes a LocalUpdate(rename,
// originalPath=primary.md) at the queue tail. After this the
// disk has: moved.md = Doc-X's bytes, primary.md = Doc-Y's
// bytes.
{
type: "rename",
client: 0,
oldPath: "staging.md",
newPath: "primary.md"
},
// Let everything fly: server processes the queued POSTs;
// client 0 catches up on broadcasts.
{ type: "resume-server" },
{ type: "resume-websocket", client: 0 },
{ type: "barrier" },
{
type: "assert-consistent",
verify: (state: AssertableState): void => {
// The user did two distinct creates (Doc-X and Doc-Y);
// both contents must survive on both clients.
state.assertFileCount(3);
state.assertFileExists("filler.md");
state.assertFileExists("moved.md");
state.assertFileExists("primary.md");
// After the renames the user expects:
// - moved.md = the file that was originally created
// at primary.md (Doc-X's content).
// - primary.md = the file that was originally created
// at staging.md (Doc-Y's content).
state.assertContains("moved.md", "primary content");
state.assertContains("primary.md", "secondary content");
// No content cross-contamination: each contribution
// should land in exactly one of the user-visible
// files. Under the bug, the orphan at primary.md
// carries Doc-X's content (because Doc-Y's PUT was
// aliased onto Doc-X's record and read Doc-X's bytes
// from moved.md), so this catches the leak too.
state.assertContentInAtMostOneFile("primary content");
state.assertContentInAtMostOneFile("secondary content");
}
}
]
};

View file

@ -5,6 +5,7 @@ import type { FileOperations } from "../file-operations/file-operations";
import { findMatchingFile } from "../utils/find-matching-file";
import type { SyncEventQueue } from "./sync-event-queue";
import { removeFromArray } from "../utils/remove-from-array";
import { FileNotFoundError } from "../errors/file-not-found-error";
/**
* Scans the local filesystem and the document database to determine
@ -77,8 +78,26 @@ export async function scheduleOfflineChanges(
}
const renamedPaths = new Set<RelativePath>();
// Track paths that were in `allLocalFiles` at scan-start but have
// since disappeared. The scan awaits between `listFilesRecursively`
// and each `read`, so a concurrent delete (slow file events, real
// user activity) can vacate a slot mid-scan. Throwing would abort
// the whole scan; nothing to sync for a file that's already gone.
const disappearedPaths = new Set<RelativePath>();
for (const path of locallyPossibleCreatedFiles) {
const content = await operations.read(path);
let content: Uint8Array;
try {
content = await operations.read(path);
} catch (e) {
if (e instanceof FileNotFoundError) {
logger.debug(
`File ${path} disappeared before offline-scan could read it; skipping`
);
disappearedPaths.add(path);
continue;
}
throw e;
}
const contentHash = await hash(content);
const matchingDeletedFile = await findMatchingFile(
@ -105,7 +124,7 @@ export async function scheduleOfflineChanges(
}
for (const path of locallyPossibleCreatedFiles) {
if (renamedPaths.has(path)) {
if (renamedPaths.has(path) || disappearedPaths.has(path)) {
continue;
}

View file

@ -189,6 +189,52 @@ export class SyncEventQueue {
this._lastSeenUpdateId.add(id);
}
/**
* Watermark to send with our own `POST /documents` requests.
*
* The contiguous-prefix `lastSeenUpdateId` lags behind reality whenever
* there are gaps in the vuid stream we've observed: if the server has
* committed vuids 1..N from various clients but we've only processed
* a non-contiguous subset, `min` stays at the last hole. The server's
* create handler reads this watermark to decide whether to merge a
* new POST into an existing doc at the same path:
*
* creation_vault_update_id > last_seen_vault_update_id merge
*
* That check is meant to fire only for docs the client genuinely
* couldn't have known about. But on a same-device "rename a
* pending-create away then create something else at that path" race,
* the second POST went out with `last_seen = min` while we already
* held a record for the first create at vuid=N and the server
* happily merged the second create into our own doc, aliasing two
* physically distinct local files onto a single docId.
*
* The fix is path-scoped: if we already track a doc whose
* `remoteRelativePath` matches the path we're about to POST, the
* server's existing doc at that path is exactly the one we'd alias
* into. Bumping `last_seen` to that record's `parentVersionId`
* forces the server's `creation_vuid > last_seen` check to fail and
* fall through to the deconflict path. For paths we don't yet
* track, we send the regular `min` watermark so a legitimate
* cross-device merge (two clients independently creating the same
* path) still fires when neither side holds a record for the
* collision target.
*/
public lastSeenUpdateIdForCreate(
requestPath: RelativePath
): VaultUpdateId {
let watermark = this._lastSeenUpdateId.min;
for (const record of this.byDocId.values()) {
if (
record.remoteRelativePath === requestPath &&
record.parentVersionId > watermark
) {
watermark = record.parentVersionId;
}
}
return watermark;
}
/**
* Pin an additional ignore pattern that survives setting reloads. Used
* by the Syncer to hide internal scratch paths (e.g. `.vaultlink/**`

View file

@ -500,9 +500,16 @@ export class Syncer {
// `updatePendingCreatePath` mutates queued creates when a not-yet-sent
// local file is renamed, so a renamed-away generation does not create
// a server document at a path that a newer local file has reused.
//
// `lastSeenUpdateIdForCreate(requestPath)` (rather than the contiguous
// `lastSeenUpdateId`) blocks the server from path-merging this POST
// into a doc we already track at the same path. Without that, a
// same-device rename race can alias two physically distinct local
// files onto one docId. See `SyncEventQueue.lastSeenUpdateIdForCreate`.
const response = await this.syncService.create({
relativePath: requestPath,
lastSeenVaultUpdateId: this.queue.lastSeenUpdateId,
lastSeenVaultUpdateId:
this.queue.lastSeenUpdateIdForCreate(requestPath),
contentBytes
});

View file

@ -41,9 +41,12 @@ echo "Server started with PID: $server_pid"
# Ensure server is killed on script exit
cleanup_server() {
echo "Stopping server (PID: $server_pid)..."
kill $server_pid 2>/dev/null || true
wait $server_pid 2>/dev/null || true
if [ -n "$server_pid" ]; then
echo "Stopping server (PID: $server_pid)..."
kill $server_pid 2>/dev/null || true
wait $server_pid 2>/dev/null || true
server_pid=""
fi
}
trap cleanup_server EXIT
@ -127,6 +130,7 @@ while true; do
done
if $all_done; then
cleanup_server
echo "All processes completed successfully"
exit 0
fi