Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/postgres.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ db.postgres.timeout=0
Connection timeout is disabled, to account for situations where the database
might be slow for unexpected reasons.

### Startup retry

By default, LND retries connecting to Postgres at startup if the database is
not yet available. This is useful in environments like Kubernetes where the
database container may not be ready when LND starts.

* `db.postgres.startup-max-retries=5` sets the maximum number of connection
attempts. Set to `0` to disable retries and fail immediately.
* `db.postgres.startup-retry-delay=1s` sets the delay between retry attempts.

Moreover for particular kv tables we also add the option to access the
tables via a global lock (single wirter). This is a temorpary measure until
these particular tables have a native sql schema. This helps to mitigate
Expand Down
6 changes: 6 additions & 0 deletions docs/release-notes/release-notes-0.21.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,19 @@
comprehensive tests. The migration is currently dev-only, compiled behind
the `test_db_postgres`, `test_db_sqlite`, or `test_native_sql` build tags.

* Added [WaitForPostgresReady](https://github.com/lightningnetwork/lnd/pull/10564)
to wait for the Postgres database to become available before opening the
connection. This is necessary in environments where the database may not be
ready when LND starts. Also added `startup-max-retries` and
`startup-retry-delay` configuration options to control the retry behavior.

## Code Health

## Tooling and Documentation

# Contributors (Alphabetical Order)

* Abdulkbk
* bitromortac
* Boris Nagaev
* Elle Mouton
Expand Down
17 changes: 16 additions & 1 deletion lncfg/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ const (

defaultSqliteBusyTimeout = 5 * time.Second

defaultPostgresStartupMaxRetries = 5
defaultPostgresStartupRetryDelay = 1 * time.Second

// NSChannelDB is the namespace name that we use for the combined graph
// and channel state DB.
NSChannelDB = "channeldb"
Expand Down Expand Up @@ -114,7 +117,9 @@ func DefaultDB() *DB {
MaxMsgSize: 32768 * 1024,
},
Postgres: &sqldb.PostgresConfig{
MaxConnections: defaultPostgresMaxConnections,
MaxConnections: defaultPostgresMaxConnections,
StartupMaxRetries: defaultPostgresStartupMaxRetries,
StartupRetryDelay: defaultPostgresStartupRetryDelay,
// Normally we don't use a global lock for channeldb
// access, but if a user encounters huge concurrency
// issues, they can enable this to use a global lock.
Expand Down Expand Up @@ -403,6 +408,16 @@ func (db *DB) GetBackends(ctx context.Context, chanDBPath,
}, nil

case PostgresBackend:
// Wait for the Postgres database to become available before
// opening the connection. This is necessary in environments
// where the database may not be ready when LND starts.
if err := sqldb.WaitForPostgresReady(
ctx, db.Postgres,
); err != nil {
return nil, fmt.Errorf("error waiting for postgres "+
"to become available: %v", err)
}

// Convert the sqldb PostgresConfig to a kvdb postgres.Config.
// This is a temporary measure until we migrate all kvdb SQL
// users to native SQL.
Expand Down
11 changes: 11 additions & 0 deletions sample-lnd.conf
Original file line number Diff line number Diff line change
Expand Up @@ -1626,6 +1626,17 @@
; Example:
; db.postgres.maxconnections=

; Maximum number of times to retry connecting to the database at startup. This
; is useful in environments like Kubernetes where the database container may not
; be ready when LND starts. Set to zero to disable retries.
; Default:
; db.postgres.startup-max-retries=5

; The delay between connection retry attempts at startup. Valid time units are
; {s, m, h}.
; Default:
; db.postgres.startup-retry-delay=1s

; Whether to skip executing schema migrations.
; db.postgres.skipmigrations=false

Expand Down
7 changes: 7 additions & 0 deletions sqldb/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ type PostgresConfig struct {
Dsn string `long:"dsn" description:"Database connection string."`
Timeout time.Duration `long:"timeout" description:"Database connection timeout. Set to zero to disable."`
MaxConnections int `long:"maxconnections" description:"The maximum number of open connections to the database. Set to zero for unlimited."`
StartupMaxRetries int `long:"startup-max-retries" description:"Maximum number of times to retry connecting to the database at startup. Set to zero to disable retries."`
StartupRetryDelay time.Duration `long:"startup-retry-delay" description:"The delay between connection retry attempts at startup."`
SkipMigrations bool `long:"skipmigrations" description:"Skip applying migrations on startup."`
ChannelDBWithGlobalLock bool `long:"channeldb-with-global-lock" description:"Use a global lock for channeldb access. This ensures only a single writer at a time but reduces concurrency. This is a temporary workaround until the revocation log is migrated to a native sql schema."`
WalletDBWithGlobalLock bool `long:"walletdb-with-global-lock" description:"Use a global lock for wallet database access. This ensures only a single writer at a time but reduces concurrency. This is a temporary workaround until the wallet subsystem is upgraded to a native sql schema."`
Expand All @@ -65,6 +67,11 @@ func (p *PostgresConfig) Validate() error {
return fmt.Errorf("invalid DSN: %w", err)
}

if p.StartupMaxRetries > 0 && p.StartupRetryDelay <= 0 {
return fmt.Errorf("startup retry delay must be positive " +
"when retries are enabled")
}

if err := p.QueryConfig.Validate(false); err != nil {
return fmt.Errorf("invalid query config: %w", err)
}
Expand Down
50 changes: 50 additions & 0 deletions sqldb/postgres.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,56 @@ func (s *PostgresStore) ExecuteMigrations(target MigrationTarget) error {
)
}

// WaitForPostgresReady waits for the Postgres database to become available by
// pinging it in a retry loop with a fixed delay. This is useful in environments
// like Kubernetes where the database container may not be ready when LND
// starts. If StartupMaxRetries is zero, this function returns immediately
// without pinging.
func WaitForPostgresReady(ctx context.Context,
cfg *PostgresConfig) error {

if cfg.StartupMaxRetries <= 0 {
return nil
}

sanitizedDSN, err := replacePasswordInDSN(cfg.Dsn)
if err != nil {
return err
}

db, err := sql.Open("pgx", cfg.Dsn)
if err != nil {
return fmt.Errorf("error creating postgres connection: %w",
err)
}
defer db.Close()

for attempt := 0; attempt < cfg.StartupMaxRetries; attempt++ {
pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exponential back off?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentionally used a fixed delay since the primary use case is a startup race in which Postgres is ready within seconds. The retry delay is configurable if users need a different interval.

err = db.PingContext(pingCtx)
cancel()

if err == nil {
log.Infof("Postgres is ready at '%s'", sanitizedDSN)
return nil
}

log.Warnf("Failed to connect to postgres at '%s' "+
"(attempt %d/%d): %v", sanitizedDSN, attempt+1,
cfg.StartupMaxRetries, err)

select {
case <-time.After(cfg.StartupRetryDelay):
case <-ctx.Done():
return fmt.Errorf("context canceled while waiting "+
"for postgres: %w", ctx.Err())
}
}

return fmt.Errorf("failed to connect to postgres at '%s' after %d "+
"attempts: %w", sanitizedDSN, cfg.StartupMaxRetries, err)
Comment on lines +201 to +236
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-medium medium

The WaitForPostgresReady function in this section (lines 201-236) has a critical security flaw: sensitive database credentials (passwords) can be leaked into application logs and error messages. The replacePasswordInDSN function (defined at line 48) only sanitizes DSNs in URL format, failing to handle "keyword=value" formats. This can result in unsanitized passwords being logged (lines 219, 223) or included in errors (line 235). It is crucial to improve replacePasswordInDSN to robustly sanitize both DSN formats or avoid logging the full DSN. Furthermore, the ping timeout is currently hardcoded to 5 seconds (lines 214-215). For better flexibility and environmental tuning, consider making this configurable by utilizing cfg.Timeout from PostgresConfig if available, otherwise defaulting to 5 seconds.

Suggested change
sanitizedDSN, err := replacePasswordInDSN(cfg.Dsn)
if err != nil {
return err
}
db, err := sql.Open("pgx", cfg.Dsn)
if err != nil {
return fmt.Errorf("error creating postgres connection: %w",
err)
}
defer db.Close()
for attempt := 0; attempt < cfg.StartupMaxRetries; attempt++ {
pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
err = db.PingContext(pingCtx)
cancel()
if err == nil {
log.Infof("Postgres is ready at '%s'", sanitizedDSN)
return nil
}
log.Warnf("Failed to connect to postgres at '%s' "+
"(attempt %d/%d): %v", sanitizedDSN, attempt+1,
cfg.StartupMaxRetries, err)
select {
case <-time.After(cfg.StartupRetryDelay):
case <-ctx.Done():
return fmt.Errorf("context canceled while waiting "+
"for postgres: %w", ctx.Err())
}
}
return fmt.Errorf("failed to connect to postgres at '%s' after %d "+
"attempts: %w", sanitizedDSN, cfg.StartupMaxRetries, err)
pingTimeout := 5 * time.Second
if cfg.Timeout > 0 {
pingTimeout = cfg.Timeout
}
pingCtx, cancel := context.WithTimeout(ctx, pingTimeout)
err = db.PingContext(pingCtx)

}

// GetSchemaVersion returns the current schema version of the Postgres database.
func (s *PostgresStore) GetSchemaVersion() (int, bool, error) {
driver, err := pgx_migrate.WithInstance(s.DB, &pgx_migrate.Config{})
Expand Down
72 changes: 72 additions & 0 deletions sqldb/postgres_wait_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package sqldb

import (
"context"
"testing"
"time"

"github.com/stretchr/testify/require"
)

// TestWaitForPostgresReadySkipsWhenDisabled verifies that WaitForPostgresReady
// returns immediately when StartupMaxRetries is set to zero.
func TestWaitForPostgresReadySkipsWhenDisabled(t *testing.T) {
t.Parallel()

cfg := &PostgresConfig{
Dsn: "postgres://localhost:1/testdb",
StartupMaxRetries: 0,
StartupRetryDelay: 1 * time.Second,
}

err := WaitForPostgresReady(context.Background(), cfg)
require.NoError(t, err)
}

// TestWaitForPostgresReadyExhaustsRetries verifies that WaitForPostgresReady
// returns an error after exhausting all retry attempts against an unreachable
// endpoint.
func TestWaitForPostgresReadyExhaustsRetries(t *testing.T) {
t.Parallel()

cfg := &PostgresConfig{
Dsn: "postgres://localhost:1/testdb",
StartupMaxRetries: 3,
StartupRetryDelay: 10 * time.Millisecond,
}

err := WaitForPostgresReady(context.Background(), cfg)
require.Error(t, err)
require.Contains(t, err.Error(), "failed to connect to postgres")
require.Contains(t, err.Error(), "3 attempts")
}

// TestWaitForPostgresReadyContextCancel verifies that WaitForPostgresReady
// respects context cancellation and stops retrying early.
func TestWaitForPostgresReadyContextCancel(t *testing.T) {
t.Parallel()

cfg := &PostgresConfig{
Dsn: "postgres://localhost:1/testdb",
StartupMaxRetries: 100,
StartupRetryDelay: 1 * time.Second,
}

ctx, cancel := context.WithCancel(context.Background())

// Cancel the context after a short delay.
go func() {
time.Sleep(100 * time.Millisecond)
cancel()
}()

start := time.Now()
err := WaitForPostgresReady(ctx, cfg)
elapsed := time.Since(start)

require.Error(t, err)
require.Contains(t, err.Error(), "context canceled")

// Ensure we didn't wait for all 100 retries.
require.Less(t, elapsed, 10*time.Second)
}
Loading