Fix syncing when network latency is present (#4)

* WIP * Add debug * Dedupe inserts * Add deterministic ordering * Fix whitespaces * Update insta * Add integration test script * Rename * Add test * Working for non-deletes * omg it mostly works for deletes * Isdeleted fix * remove created dates * update api * Take document id * No max attempt * works * Use string uuids * . * working!!!! (hopefully) * Improve bundling * Add module * lint * . * lint * Fix CI * use toolchain * clean up * Add useSlowFileEvents * Delete fuzz * Fix CI * use docker * fix script * clean up * Clean up * change node version * Build docker image on every commit * fix ci * 1 db per vault * Add scritps folder * Bump versions * Lint * . * Fix tests for real * Style * . * try * Consistent ordering * Fix tests * hmm * . * Clean up diff * Fixes * . * Fix version bump * . * . * .
2025-03-16 20:13:49 +00:00 · 2025-03-16 20:13:49 +00:00 · 8b8f1d91d9
commit 8b8f1d91d9
parent bcf48c428d
91 changed files with 2252 additions and 1586 deletions
--- a/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-2.snap
+++ b/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-2.snap
@ -0,0 +1,6 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\"\")"
+snapshot_kind: text
+---
+[]
--- a/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-3.snap
+++ b/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-3.snap
@ -0,0 +1,15 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\" what? \")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "what?",
+        original: " what?",
+    },
+    Token {
+        normalised: "",
+        original: " ",
+    },
+]
--- a/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-4.snap
+++ b/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots-4.snap
@ -0,0 +1,23 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\" hello, \\nwhere are you?\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "hello,",
+        original: " hello,",
+    },
+    Token {
+        normalised: "where",
+        original: " \nwhere",
+    },
+    Token {
+        normalised: "are",
+        original: " are",
+    },
+    Token {
+        normalised: "you?",
+        original: " you?",
+    },
+]
--- a/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots.snap
+++ b/backend/reconcile/src/tokenizer/snapshots/reconciletokenizerword_tokenizertestswith_snapshots.snap
@ -0,0 +1,15 @@
+---
+source: reconcile/src/tokenizer/word_tokenizer.rs
+expression: "word_tokenizer(\"Hi there!\")"
+snapshot_kind: text
+---
+[
+    Token {
+        normalised: "Hi",
+        original: "Hi",
+    },
+    Token {
+        normalised: "there!",
+        original: " there!",
+    },
+]
--- a/backend/reconcile/src/tokenizer/token.rs
+++ b/backend/reconcile/src/tokenizer/token.rs
@ -8,24 +8,19 @@ use serde::{Deserialize, Serialize};
 #[derive(Debug, Clone)]
 pub struct Token<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    normalised: T,
    original: String,
 }

 impl From<&str> for Token<String> {
-    fn from(s: &str) -> Self {
-        Token {
-            normalised: s.to_owned(),
-            original: s.to_owned(),
-        }
-    }
+    fn from(s: &str) -> Self { Token::new(s.trim().to_owned(), s.to_owned()) }
 }

 impl<T> Token<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    pub fn new(normalised: T, original: String) -> Self {
        Token {
@ -43,7 +38,7 @@ where

 impl<T> PartialEq for Token<T>
 where
-    T: PartialEq + Clone,
+    T: PartialEq + Clone + std::fmt::Debug,
 {
    fn eq(&self, other: &Self) -> bool { self.normalised == other.normalised }
 }
--- a/backend/reconcile/src/tokenizer/word_tokenizer.rs
+++ b/backend/reconcile/src/tokenizer/word_tokenizer.rs
@ -1,7 +1,48 @@
 use super::token::Token;

+/// Splits on whitespace, keeping each word's leading whitespace attached to it;
+/// trailing whitespace becomes a final token of its own.
+///
+/// ## Example
+///
+/// "Hi there!" -> ["Hi", " there!"]
 pub fn word_tokenizer(text: &str) -> Vec<Token<String>> {
-    text.split_inclusive(char::is_whitespace)
-        .map(|s| Token::new(s.to_owned(), s.to_owned()))
-        .collect()
+    let mut result: Vec<Token<String>> = Vec::new();
+
+    let mut last_whitespace = 0;
+    let mut previous_char_is_whitespace = true;
+
+    for (i, c) in text.char_indices() {
+        let is_current_char_whitespace = c.is_whitespace();
+        if !previous_char_is_whitespace && is_current_char_whitespace {
+            result.push(text[last_whitespace..i].into());
+            last_whitespace = i;
+        }
+
+        previous_char_is_whitespace = is_current_char_whitespace;
+    }
+
+    if last_whitespace < text.len() {
+        result.push(text[last_whitespace..].into());
+    }
+
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use insta::assert_debug_snapshot;
+
+    use super::*;
+
+    #[test]
+    fn test_with_snapshots() {
+        assert_debug_snapshot!(word_tokenizer("Hi there!"));
+
+        assert_debug_snapshot!(word_tokenizer(""));
+
+        assert_debug_snapshot!(word_tokenizer(" what? "));
+
+        assert_debug_snapshot!(word_tokenizer(" hello, \nwhere are you?"));
+    }
 }