diff --git a/cmd/msgvault/cmd/serve.go b/cmd/msgvault/cmd/serve.go index 14a4c5da..4d277f51 100644 --- a/cmd/msgvault/cmd/serve.go +++ b/cmd/msgvault/cmd/serve.go @@ -22,6 +22,7 @@ import ( "go.kenn.io/msgvault/internal/search" "go.kenn.io/msgvault/internal/store" "go.kenn.io/msgvault/internal/sync" + "go.kenn.io/msgvault/internal/syncerr" "golang.org/x/oauth2" ) @@ -468,6 +469,13 @@ func runScheduledGmailSync(ctx context.Context, email string, src *store.Source, } tokenSource, tsErr = oauthMgr.TokenSource(ctx, email) if tsErr != nil { + // Distinguish transient network failures (DNS lookup timeout, + // dial timeout after laptop sleep/wake, Wi-Fi flap) from real + // auth errors. Suggesting reauth on every network blip sends + // the user down the wrong path. + if syncerr.IsTransientNetwork(tsErr) { + return nil, fmt.Errorf("get token source: %w (transient network error; will retry on next schedule)", tsErr) + } if oauthMgr.HasToken(email) { return nil, fmt.Errorf("get token source: %w (token may be expired; run 'sync %s' or 'verify %s' from an interactive terminal to re-authorize)", tsErr, email, email) } diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index 53ff5a12..b2c8554b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -10,6 +10,7 @@ import ( "github.com/robfig/cron/v3" "go.kenn.io/msgvault/internal/config" + "go.kenn.io/msgvault/internal/syncerr" ) // SyncFunc is the callback invoked when a scheduled sync should run. @@ -275,10 +276,20 @@ func (s *Scheduler) runSync(email string) { s.mu.Lock() if err != nil { s.lastErr[email] = err - s.logger.Error("scheduled sync failed", - "email", email, - "duration", time.Since(start), - "error", err) + // Transient network failures (e.g., DNS lookup timeout after + // laptop sleep/wake) aren't actionable — the next scheduled tick + // will retry. Log at WARN to keep them out of error dashboards. + if syncerr.IsTransientNetwork(err) { + s.logger.Warn("scheduled sync skipped (transient network)", + "email", email, + "duration", time.Since(start), + "error", err) + } else { + s.logger.Error("scheduled sync failed", + "email", email, + "duration", time.Since(start), + "error", err) + } } else { s.lastRun[email] = time.Now() s.lastErr[email] = nil diff --git a/internal/store/store.go b/internal/store/store.go index dd5dcc9f..013cce9e 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -37,7 +37,13 @@ type Store struct { closeCleanup func() } -const defaultSQLiteParams = "?_journal_mode=WAL&_busy_timeout=30000&_synchronous=NORMAL&_foreign_keys=ON" +// synchronous=FULL + fullfsync=true protects WAL writes against OS/power crashes +// (NORMAL only protects against process crashes). msgvault is commonly run as a +// laptop daemon (`msgvault serve`) where sleep/wake, forced reboots, and OOM kills +// give many opportunities to leave a torn page on disk; the write volume is tiny +// so the durability cost is negligible. fullfsync is macOS-only (F_FULLFSYNC +// fcntl) and a no-op on other platforms. +const defaultSQLiteParams = "?_journal_mode=WAL&_busy_timeout=30000&_synchronous=FULL&_fullfsync=true&_foreign_keys=ON" // isSQLiteError checks if err is a sqlite3.Error with a message containing substr. // This is more robust than strings.Contains on err.Error() because it first @@ -63,6 +69,14 @@ func isPostgresURL(dbPath string) bool { return strings.HasPrefix(dbPath, "postgresql://") || strings.HasPrefix(dbPath, "postgres://") } +// testSQLiteParams configures SQLite for ephemeral test databases: WAL mode +// for concurrency parity with production, but synchronous=OFF (no fsync per +// commit). Test DBs live in t.TempDir() and are discarded at test exit, so +// durability against OS crashes is irrelevant — and on slow-fsync platforms +// like Windows CI runners, the production FULL setting can push bulk-import +// tests past their timing tripwires. +const testSQLiteParams = "?_journal_mode=WAL&_busy_timeout=30000&_synchronous=OFF&_foreign_keys=ON" + // Open opens or creates the database at the given path. // If dbPath is a postgres:// or postgresql:// URL, opens a PostgreSQL connection. // Otherwise, opens a SQLite database at the file path. @@ -70,11 +84,25 @@ func Open(dbPath string) (*Store, error) { if isPostgresURL(dbPath) { return openPostgres(dbPath) } - return openSQLite(dbPath) + return openSQLite(dbPath, defaultSQLiteParams) +} + +// OpenForTest opens or creates a database tuned for test use: ephemeral, +// fast, with durability disabled. PostgreSQL URLs go through the normal +// connection path (durability is a server-side concern there). +// +// Not for production use — a process crash mid-test can leave a corrupt +// database, which is fine because tests recreate it from scratch. +func OpenForTest(dbPath string) (*Store, error) { + if isPostgresURL(dbPath) { + return openPostgres(dbPath) + } + return openSQLite(dbPath, testSQLiteParams) } -// openSQLite opens a SQLite database at the given file path. -func openSQLite(dbPath string) (*Store, error) { +// openSQLite opens a SQLite database at the given file path with the +// supplied DSN parameters appended. +func openSQLite(dbPath, params string) (*Store, error) { // Ensure directory exists (skip for in-memory databases) if dbPath != ":memory:" && !strings.Contains(dbPath, ":memory:") { dir := filepath.Dir(dbPath) @@ -83,7 +111,7 @@ func openSQLite(dbPath string) (*Store, error) { } } - dsn := dbPath + defaultSQLiteParams + dsn := dbPath + params db, err := sql.Open("sqlite3", dsn) if err != nil { return nil, fmt.Errorf("open database: %w", err) diff --git a/internal/syncerr/transient.go b/internal/syncerr/transient.go new file mode 100644 index 00000000..e90c2e2e --- /dev/null +++ b/internal/syncerr/transient.go @@ -0,0 +1,50 @@ +// Package syncerr provides shared error classification for sync flows. +package syncerr + +import ( + "errors" + "net" + "strings" +) + +// IsTransientNetwork reports whether err (or anything it wraps) indicates a +// transient network problem rather than a logical or auth failure. It catches +// DNS lookup failures, dial timeouts, and connection refused — common after +// laptop sleep/wake or Wi-Fi flaps. Callers should retry on the next schedule +// instead of alerting the user; the underlying credentials are not invalid. +func IsTransientNetwork(err error) bool { + if err == nil { + return false + } + var dnsErr *net.DNSError + if errors.As(err, &dnsErr) { + return true + } + var opErr *net.OpError + if errors.As(err, &opErr) { + return true + } + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + return true + } + // Some libraries (notably golang.org/x/oauth2) wrap transport errors + // with fmt.Errorf without %w, hiding the typed *url.Error / *net.OpError + // from errors.As. Fall back to substring matching for those cases. + s := err.Error() + switch { + case strings.Contains(s, "dial tcp"): + return true + case strings.Contains(s, "no such host"): + return true + case strings.Contains(s, "i/o timeout"): + return true + case strings.Contains(s, "connection refused"): + return true + case strings.Contains(s, "network is unreachable"): + return true + case strings.Contains(s, "no route to host"): + return true + } + return false +} diff --git a/internal/syncerr/transient_test.go b/internal/syncerr/transient_test.go new file mode 100644 index 00000000..6c04ef5d --- /dev/null +++ b/internal/syncerr/transient_test.go @@ -0,0 +1,70 @@ +package syncerr + +import ( + "errors" + "fmt" + "net" + "net/url" + "testing" +) + +func TestIsTransientNetwork(t *testing.T) { + dnsErr := &net.DNSError{Err: "no such host", Name: "oauth2.googleapis.com", IsNotFound: true} + dialTimeoutErr := &url.Error{ + Op: "Post", + URL: "https://oauth2.googleapis.com/token", + Err: &net.OpError{ + Op: "dial", + Net: "tcp", + Err: errors.New("i/o timeout"), + }, + } + + tests := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"plain string error", errors.New("something broke"), false}, + {"typed DNSError", dnsErr, true}, + {"typed url.Error wrapping dial timeout", dialTimeoutErr, true}, + {"wrapped DNSError via %w", fmt.Errorf("refresh token: %w", dnsErr), true}, + // oauth2 lib wraps with %v (not %w), so typed-unwrap fails; + // substring fallback must catch it. + { + "oauth2-style %v-wrapped dial error", + fmt.Errorf(`refresh token: Post "https://oauth2.googleapis.com/token": dial tcp: lookup oauth2.googleapis.com: i/o timeout`), + true, + }, + { + "no such host substring", + fmt.Errorf(`refresh token: Post "...": dial tcp: lookup oauth2.googleapis.com: no such host`), + true, + }, + { + "connection refused substring", + errors.New("dial tcp 127.0.0.1:443: connection refused"), + true, + }, + { + "actual auth error (invalid_grant) is NOT transient", + errors.New(`oauth2: "invalid_grant" "Token has been expired or revoked."`), + false, + }, + { + "generic SQL error is NOT transient", + errors.New("database disk image is malformed"), + false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := IsTransientNetwork(tc.err) + if got != tc.want { + t.Errorf("IsTransientNetwork(%v) = %v, want %v", tc.err, got, tc.want) + } + }) + } +} diff --git a/internal/testutil/store_helpers.go b/internal/testutil/store_helpers.go index f56fcf0e..25d76544 100644 --- a/internal/testutil/store_helpers.go +++ b/internal/testutil/store_helpers.go @@ -31,7 +31,7 @@ func NewTestStore(t *testing.T) *store.Store { } dbPath := filepath.Join(t.TempDir(), "test.db") - st, err := store.Open(dbPath) + st, err := store.OpenForTest(dbPath) if err != nil { t.Fatalf("open store: %v", err) }