use crate::{ app_state::{ AppState, database::models::VaultId, websocket::{ models::{ CursorPositionFromServer, WebSocketClientMessage, WebSocketServerMessage, WebSocketVaultUpdate, }, utils::{ get_authenticated_handshake, get_unseen_documents, send_update_over_websocket, }, }, }, consts::{ HANDSHAKE_TIMEOUT, MAX_CURSOR_DOCUMENTS, MAX_CURSORS_PER_DOCUMENT, MAX_RELATIVE_PATH_LEN, }, errors::{SyncServerError, client_error, server_error}, utils::normalize::normalize, }; use anyhow::Context; use axum::{ extract::{ Path, State, ws::{Message, WebSocket, WebSocketUpgrade}, }, response::Response, }; use futures::sink::SinkExt; use futures::stream::StreamExt; use log::{debug, info, warn}; use serde::Deserialize; /// Tracks a pending (not yet authenticated) WebSocket connection. /// Decrements the counter when dropped, ensuring cleanup even if /// the upgrade never completes or auth fails. struct PendingWsGuard(std::sync::Arc); impl Drop for PendingWsGuard { fn drop(&mut self) { self.0.fetch_sub(1, std::sync::atomic::Ordering::Relaxed); } } #[derive(Deserialize)] pub struct WebSocketPathParams { #[serde(deserialize_with = "normalize")] vault_id: VaultId, } pub async fn websocket_handler( ws: WebSocketUpgrade, Path(WebSocketPathParams { vault_id }): Path, State(state): State, ) -> Result { let current = state .pending_ws_connections .fetch_add(1, std::sync::atomic::Ordering::Relaxed); if current >= state.config.server.max_pending_websocket_connections { state .pending_ws_connections .fetch_sub(1, std::sync::atomic::Ordering::Relaxed); return Err(client_error(anyhow::anyhow!( "Too many pending WebSocket connections" ))); } let guard = PendingWsGuard(state.pending_ws_connections.clone()); Ok(ws.on_upgrade(move |socket| websocket_wrapped(state, socket, vault_id, guard))) } async fn websocket_wrapped( state: AppState, stream: WebSocket, vault_id: VaultId, pending_guard: PendingWsGuard, ) { info!("WebSocket connection opened on vault `{vault_id}`"); let result = websocket(state, stream, vault_id.clone(), pending_guard).await; if let Err(err) = result { debug!("WebSocket connection error on vault `{vault_id}`: {err}"); } } #[allow(clippy::too_many_lines)] async fn websocket( state: AppState, stream: WebSocket, vault_id: VaultId, pending_guard: PendingWsGuard, ) -> Result<(), SyncServerError> { let (mut sender, mut websocket_receiver) = stream.split(); let handshake_msg = tokio::time::timeout(HANDSHAKE_TIMEOUT, websocket_receiver.next()) .await .map_err(|_| client_error(anyhow::anyhow!("WebSocket handshake timed out")))? .transpose() .map_err(|e| client_error(anyhow::anyhow!("WebSocket error during handshake: {e}")))?; let authed_handshake = get_authenticated_handshake(&state, &vault_id, handshake_msg)?; info!( "WebSocket handshake successful for vault `{vault_id}` for `{}`", authed_handshake.handshake.device_id ); // Auth complete — no longer a pending connection. drop(pending_guard); let max_clients = state.config.server.max_clients_per_vault; // Atomic subscribe + cursor snapshot, serialized against in-flight // broadcasts: // // 1. Acquire the per-vault broadcast send lock. While we hold it, // no `send_document_update` can run, so no broadcast can fire // between our subscribe and our cursor snapshot. // 2. Subscribe to the broadcast channel (now we'll see every // broadcast that fires after we drop the send guard). // 3. Snapshot `cursor = max committed vault_update_id`. Because // `insert_document_version` holds the same send lock from // *before* the commit through *after* the broadcast, every doc // visible at this cursor has either (a) already had its // broadcast delivered to all then-existing subscribers — and we // weren't one of them, so we'll catch it via the snapshot — or // (b) had its broadcast contend on the lock we're holding, and // will be delivered to us as soon as we drop the guard, with // `vault_update_id > cursor`. // 4. Drop the send guard so writers can resume broadcasting. // 5. Stream the catch-up bounded by the cursor — i.e. only docs // with `vault_update_id <= cursor` — exactly once. // 6. The send task forwards broadcasts but filters to // `vault_update_id > cursor`, so a doc that's both in the // catch-up and in a contended-then-released broadcast is // delivered exactly once (via the catch-up). let send_guard = state.broadcasts.acquire_send_lock(&vault_id).await; let mut broadcast_receiver = match state.broadcasts.get_receiver(&vault_id, max_clients) { Ok(receiver) => receiver, Err(err) => { drop(send_guard); warn!( "Vault `{vault_id}` has reached the maximum number of clients ({max_clients}), rejecting connection from `{}`", authed_handshake.handshake.device_id ); if let Err(e) = sender .send(Message::Close(Some(axum::extract::ws::CloseFrame { code: 4000, reason: format!( "Vault has reached the maximum number of clients ({max_clients})" ) .into(), }))) .await { warn!("Failed to send WebSocket close frame: {e}"); } return Err(err); } }; let cursor = state .database .get_max_update_id_in_vault(&vault_id, None) .await .map_err(server_error)?; drop(send_guard); // Catch-up on versions committed while this client was offline, // streamed one-at-a-time in ascending `vault_update_id` order, up // to the snapshot cursor. let unseen_documents = get_unseen_documents( &state, &vault_id, authed_handshake.handshake.last_seen_vault_update_id, cursor, ) .await?; let unseen_summary: Vec<(i64, bool, String)> = unseen_documents .iter() .map(|d| (d.vault_update_id, d.is_deleted, d.relative_path.clone())) .collect(); info!( "[CATCHUP] vault={vault_id} device={} last_seen={:?} cursor={cursor} unseen_count={} unseen={:?}", authed_handshake.handshake.device_id, authed_handshake.handshake.last_seen_vault_update_id, unseen_summary.len(), unseen_summary ); for document in unseen_documents { send_update_over_websocket( &WebSocketServerMessage::VaultUpdate(WebSocketVaultUpdate { document }), &mut sender, ) .await?; } send_update_over_websocket( &WebSocketServerMessage::CursorPositions(CursorPositionFromServer { clients: state.cursors.get_cursors(&vault_id).await, }), &mut sender, ) .await?; let device_id = authed_handshake.handshake.device_id.clone(); let mut send_task = tokio::spawn(async move { loop { match broadcast_receiver.recv().await { Ok(update) => { // Drop messages this device authored because the HTTP // response already carried authoritative state back. // Delete broadcasts are sent without an origin so the // author also receives them — that's the receipt the // client needs to drop the doc from its sync queue. if Some(&device_id) == update.origin_device_id.as_ref() { continue; } // Filter out vault updates already covered by the // catch-up snapshot. The handshake atomically // subscribed and snapshotted `cursor` under the // broadcast send lock, so any broadcast with // `vault_update_id <= cursor` is one that contended // on the lock during our subscribe — its row is // already in the catch-up stream and re-delivering // it via this channel would duplicate the message. // Cursor messages aren't versioned and are always // forwarded. if let WebSocketServerMessage::VaultUpdate(WebSocketVaultUpdate { document }) = &update.message && document.vault_update_id <= cursor { continue; } let message = match update.message { WebSocketServerMessage::CursorPositions(CursorPositionFromServer { clients, }) => WebSocketServerMessage::CursorPositions(CursorPositionFromServer { clients: clients .into_iter() .filter(|client| client.device_id != device_id) .collect(), }), WebSocketServerMessage::VaultUpdate(_) => update.message, }; send_update_over_websocket(&message, &mut sender).await?; } Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { warn!( "WebSocket receiver lagged, dropped {n} messages — disconnecting client to force full resync" ); break; } Err(tokio::sync::broadcast::error::RecvError::Closed) => break, } } Ok::<(), SyncServerError>(()) }); let device_id = authed_handshake.handshake.device_id.clone(); let vault_id_clone = vault_id.clone(); let cursor_manager = state.cursors.clone(); let mut receive_task = tokio::spawn(async move { while let Some(msg) = websocket_receiver.next().await { match msg { Ok(Message::Text(message)) => { let message: WebSocketClientMessage = serde_json::from_str(&message) .context("Failed to parse WebSocket message from client") .map_err(client_error)?; match message { WebSocketClientMessage::Handshake(_) => { return Err(client_error(anyhow::anyhow!( "Unexpected handshake message" ))); } WebSocketClientMessage::CursorPositions(cursors) => { let docs = cursors.documents_with_cursors; if docs.len() > MAX_CURSOR_DOCUMENTS { warn!( "Cursor update rejected: {} documents exceeds limit of {MAX_CURSOR_DOCUMENTS}", docs.len() ); continue; } let valid = docs.iter().all(|doc| { doc.cursors.len() <= MAX_CURSORS_PER_DOCUMENT && doc.relative_path.len() <= MAX_RELATIVE_PATH_LEN }); if !valid { warn!( "Cursor update rejected: a document exceeds cursor or path length limits" ); continue; } cursor_manager .update_cursors( vault_id_clone.clone(), authed_handshake.user.name.clone(), &device_id, docs, ) .await; } } } Ok(Message::Close(_)) => break, Ok(Message::Binary(_)) => { warn!("Received unexpected binary WebSocket message, ignoring"); } Ok(_) => {} // Ping/Pong frames handled by axum Err(e) => { debug!("WebSocket receive error: {e}"); break; } } } Ok::<(), SyncServerError>(()) }); let result: Result<(), SyncServerError> = tokio::select! { send_result = &mut send_task => { receive_task.abort(); let _ = receive_task.await; match send_result { Err(e) => Err(server_error( anyhow::Error::from(e).context("WebSocket send task failed"), )), Ok(inner) => inner, } }, receive_result = &mut receive_task => { send_task.abort(); let _ = send_task.await; match receive_result { Err(e) => Err(server_error( anyhow::Error::from(e).context("WebSocket receive task failed"), )), Ok(inner) => inner, } }, }; state .cursors .remove_cursors_of_device(&vault_id, &authed_handshake.handshake.device_id) .await; match &result { Ok(()) => { info!( "WebSocket disconnected on vault `{vault_id}` for `{}`", authed_handshake.handshake.device_id ); } Err(err) => { warn!( "WebSocket error on vault `{vault_id}` for `{}`: {err}", authed_handshake.handshake.device_id ); } } result }