From ad3a5a84f933f563697515df71579df315e7d905 Mon Sep 17 00:00:00 2001 From: Rob Elkin Date: Sun, 24 May 2026 14:13:41 -0500 Subject: [PATCH] sync: harden SQLite durability and stop blaming the token for network blips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - store: switch SQLite DSN to synchronous=FULL + fullfsync=true. NORMAL only protects against process crashes; FULL protects against OS/power crashes. Matters for `msgvault serve` on laptops (sleep/wake, forced reboots, OOM) where torn writes have been corrupting sync_runs — the most write-heavy table. Write volume is tiny, so the durability cost is negligible. - store: add OpenForTest with synchronous=OFF for fast test DBs. synchronous=FULL adds an fsync per commit, which on Windows CI runners pushed bulk-import tests like TestImportDYI_TimingTripwire past their 30-second timing tripwires (~50s on the Windows job). Tests use ephemeral t.TempDir() databases that get discarded at exit, so OS-crash durability is irrelevant. openSQLite now takes a params string so the test path can opt into a fast DSN; testutil.NewTestStore calls store.OpenForTest while production callers continue through store.Open with the durable DSN. - serve: distinguish transient network errors (DNS lookup timeout, dial timeout, no route) from real auth errors when refreshing tokens. The old message blamed the token on every failure, telling users to re-authorize perfectly valid tokens after a Wi-Fi flap. - scheduler: downgrade transient-network sync failures from ERROR to WARN so a laptop wake-up doesn't paint dashboards red. - syncerr: new shared package classifying transient network errors (typed net.DNSError/net.OpError plus substring fallback for libraries that wrap with %v instead of %w). Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/msgvault/cmd/serve.go | 8 ++++ internal/scheduler/scheduler.go | 19 ++++++-- internal/store/store.go | 38 +++++++++++++--- internal/syncerr/transient.go | 50 +++++++++++++++++++++ internal/syncerr/transient_test.go | 70 ++++++++++++++++++++++++++++++ internal/testutil/store_helpers.go | 2 +- 6 files changed, 177 insertions(+), 10 deletions(-) create mode 100644 internal/syncerr/transient.go create mode 100644 internal/syncerr/transient_test.go diff --git a/cmd/msgvault/cmd/serve.go b/cmd/msgvault/cmd/serve.go index 14a4c5da..4d277f51 100644 --- a/cmd/msgvault/cmd/serve.go +++ b/cmd/msgvault/cmd/serve.go @@ -22,6 +22,7 @@ import ( "go.kenn.io/msgvault/internal/search" "go.kenn.io/msgvault/internal/store" "go.kenn.io/msgvault/internal/sync" + "go.kenn.io/msgvault/internal/syncerr" "golang.org/x/oauth2" ) @@ -468,6 +469,13 @@ func runScheduledGmailSync(ctx context.Context, email string, src *store.Source, } tokenSource, tsErr = oauthMgr.TokenSource(ctx, email) if tsErr != nil { + // Distinguish transient network failures (DNS lookup timeout, + // dial timeout after laptop sleep/wake, Wi-Fi flap) from real + // auth errors. Suggesting reauth on every network blip sends + // the user down the wrong path. + if syncerr.IsTransientNetwork(tsErr) { + return nil, fmt.Errorf("get token source: %w (transient network error; will retry on next schedule)", tsErr) + } if oauthMgr.HasToken(email) { return nil, fmt.Errorf("get token source: %w (token may be expired; run 'sync %s' or 'verify %s' from an interactive terminal to re-authorize)", tsErr, email, email) } diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 53ff5a12..b2c8554b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -10,6 +10,7 @@ import ( "github.com/robfig/cron/v3" "go.kenn.io/msgvault/internal/config" + "go.kenn.io/msgvault/internal/syncerr" ) // SyncFunc is the callback invoked when a scheduled sync should run. @@ -275,10 +276,20 @@ func (s *Scheduler) runSync(email string) { s.mu.Lock() if err != nil { s.lastErr[email] = err - s.logger.Error("scheduled sync failed", - "email", email, - "duration", time.Since(start), - "error", err) + // Transient network failures (e.g., DNS lookup timeout after + // laptop sleep/wake) aren't actionable — the next scheduled tick + // will retry. Log at WARN to keep them out of error dashboards. + if syncerr.IsTransientNetwork(err) { + s.logger.Warn("scheduled sync skipped (transient network)", + "email", email, + "duration", time.Since(start), + "error", err) + } else { + s.logger.Error("scheduled sync failed", + "email", email, + "duration", time.Since(start), + "error", err) + } } else { s.lastRun[email] = time.Now() s.lastErr[email] = nil diff --git a/internal/store/store.go b/internal/store/store.go index dd5dcc9f..013cce9e 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -37,7 +37,13 @@ type Store struct { closeCleanup func() } -const defaultSQLiteParams = "?_journal_mode=WAL&_busy_timeout=30000&_synchronous=NORMAL&_foreign_keys=ON" +// synchronous=FULL + fullfsync=true protects WAL writes against OS/power crashes +// (NORMAL only protects against process crashes). msgvault is commonly run as a +// laptop daemon (`msgvault serve`) where sleep/wake, forced reboots, and OOM kills +// give many opportunities to leave a torn page on disk; the write volume is tiny +// so the durability cost is negligible. fullfsync is macOS-only (F_FULLFSYNC +// fcntl) and a no-op on other platforms. +const defaultSQLiteParams = "?_journal_mode=WAL&_busy_timeout=30000&_synchronous=FULL&_fullfsync=true&_foreign_keys=ON" // isSQLiteError checks if err is a sqlite3.Error with a message containing substr. // This is more robust than strings.Contains on err.Error() because it first @@ -63,6 +69,14 @@ func isPostgresURL(dbPath string) bool { return strings.HasPrefix(dbPath, "postgresql://") || strings.HasPrefix(dbPath, "postgres://") } +// testSQLiteParams configures SQLite for ephemeral test databases: WAL mode +// for concurrency parity with production, but synchronous=OFF (no fsync per +// commit). Test DBs live in t.TempDir() and are discarded at test exit, so +// durability against OS crashes is irrelevant — and on slow-fsync platforms +// like Windows CI runners, the production FULL setting can push bulk-import +// tests past their timing tripwires. +const testSQLiteParams = "?_journal_mode=WAL&_busy_timeout=30000&_synchronous=OFF&_foreign_keys=ON" + // Open opens or creates the database at the given path. // If dbPath is a postgres:// or postgresql:// URL, opens a PostgreSQL connection. // Otherwise, opens a SQLite database at the file path. @@ -70,11 +84,25 @@ func Open(dbPath string) (*Store, error) { if isPostgresURL(dbPath) { return openPostgres(dbPath) } - return openSQLite(dbPath) + return openSQLite(dbPath, defaultSQLiteParams) +} + +// OpenForTest opens or creates a database tuned for test use: ephemeral, +// fast, with durability disabled. PostgreSQL URLs go through the normal +// connection path (durability is a server-side concern there). +// +// Not for production use — a process crash mid-test can leave a corrupt +// database, which is fine because tests recreate it from scratch. +func OpenForTest(dbPath string) (*Store, error) { + if isPostgresURL(dbPath) { + return openPostgres(dbPath) + } + return openSQLite(dbPath, testSQLiteParams) } -// openSQLite opens a SQLite database at the given file path. -func openSQLite(dbPath string) (*Store, error) { +// openSQLite opens a SQLite database at the given file path with the +// supplied DSN parameters appended. +func openSQLite(dbPath, params string) (*Store, error) { // Ensure directory exists (skip for in-memory databases) if dbPath != ":memory:" && !strings.Contains(dbPath, ":memory:") { dir := filepath.Dir(dbPath) @@ -83,7 +111,7 @@ func openSQLite(dbPath string) (*Store, error) { } } - dsn := dbPath + defaultSQLiteParams + dsn := dbPath + params db, err := sql.Open("sqlite3", dsn) if err != nil { return nil, fmt.Errorf("open database: %w", err) diff --git a/internal/syncerr/transient.go b/internal/syncerr/transient.go new file mode 100644 index 00000000..e90c2e2e --- /dev/null +++ b/internal/syncerr/transient.go @@ -0,0 +1,50 @@ +// Package syncerr provides shared error classification for sync flows. +package syncerr + +import ( + "errors" + "net" + "strings" +) + +// IsTransientNetwork reports whether err (or anything it wraps) indicates a +// transient network problem rather than a logical or auth failure. It catches +// DNS lookup failures, dial timeouts, and connection refused — common after +// laptop sleep/wake or Wi-Fi flaps. Callers should retry on the next schedule +// instead of alerting the user; the underlying credentials are not invalid. +func IsTransientNetwork(err error) bool { + if err == nil { + return false + } + var dnsErr *net.DNSError + if errors.As(err, &dnsErr) { + return true + } + var opErr *net.OpError + if errors.As(err, &opErr) { + return true + } + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + return true + } + // Some libraries (notably golang.org/x/oauth2) wrap transport errors + // with fmt.Errorf without %w, hiding the typed *url.Error / *net.OpError + // from errors.As. Fall back to substring matching for those cases. + s := err.Error() + switch { + case strings.Contains(s, "dial tcp"): + return true + case strings.Contains(s, "no such host"): + return true + case strings.Contains(s, "i/o timeout"): + return true + case strings.Contains(s, "connection refused"): + return true + case strings.Contains(s, "network is unreachable"): + return true + case strings.Contains(s, "no route to host"): + return true + } + return false +} diff --git a/internal/syncerr/transient_test.go b/internal/syncerr/transient_test.go new file mode 100644 index 00000000..6c04ef5d --- /dev/null +++ b/internal/syncerr/transient_test.go @@ -0,0 +1,70 @@ +package syncerr + +import ( + "errors" + "fmt" + "net" + "net/url" + "testing" +) + +func TestIsTransientNetwork(t *testing.T) { + dnsErr := &net.DNSError{Err: "no such host", Name: "oauth2.googleapis.com", IsNotFound: true} + dialTimeoutErr := &url.Error{ + Op: "Post", + URL: "https://oauth2.googleapis.com/token", + Err: &net.OpError{ + Op: "dial", + Net: "tcp", + Err: errors.New("i/o timeout"), + }, + } + + tests := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"plain string error", errors.New("something broke"), false}, + {"typed DNSError", dnsErr, true}, + {"typed url.Error wrapping dial timeout", dialTimeoutErr, true}, + {"wrapped DNSError via %w", fmt.Errorf("refresh token: %w", dnsErr), true}, + // oauth2 lib wraps with %v (not %w), so typed-unwrap fails; + // substring fallback must catch it. + { + "oauth2-style %v-wrapped dial error", + fmt.Errorf(`refresh token: Post "https://oauth2.googleapis.com/token": dial tcp: lookup oauth2.googleapis.com: i/o timeout`), + true, + }, + { + "no such host substring", + fmt.Errorf(`refresh token: Post "...": dial tcp: lookup oauth2.googleapis.com: no such host`), + true, + }, + { + "connection refused substring", + errors.New("dial tcp 127.0.0.1:443: connection refused"), + true, + }, + { + "actual auth error (invalid_grant) is NOT transient", + errors.New(`oauth2: "invalid_grant" "Token has been expired or revoked."`), + false, + }, + { + "generic SQL error is NOT transient", + errors.New("database disk image is malformed"), + false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := IsTransientNetwork(tc.err) + if got != tc.want { + t.Errorf("IsTransientNetwork(%v) = %v, want %v", tc.err, got, tc.want) + } + }) + } +} diff --git a/internal/testutil/store_helpers.go b/internal/testutil/store_helpers.go index f56fcf0e..25d76544 100644 --- a/internal/testutil/store_helpers.go +++ b/internal/testutil/store_helpers.go @@ -31,7 +31,7 @@ func NewTestStore(t *testing.T) *store.Store { } dbPath := filepath.Join(t.TempDir(), "test.db") - st, err := store.Open(dbPath) + st, err := store.OpenForTest(dbPath) if err != nil { t.Fatalf("open store: %v", err) }