@@ -45,7 +45,7 @@ const (
4545 labelComponentDeletionJob = "deletion-job"
4646
4747 // Annotations
48- annotationBackupTargetRBDImage = "fin.cybozu.io/backup-target-rbd-image"
48+ AnnotationBackupTargetRBDImage = "fin.cybozu.io/backup-target-rbd-image"
4949 annotationDiffFrom = "fin.cybozu.io/diff-from"
5050 annotationFinBackupName = "fin.cybozu.io/finbackup-name"
5151 annotationFinBackupNamespace = "fin.cybozu.io/finbackup-namespace"
@@ -68,6 +68,7 @@ const (
6868var (
6969 errNonRetryableReconcile = errors .New ("non retryable reconciliation error; " +
7070 "reconciliation must not keep going nor be retried" )
71+ errVolumeLockedByAnother = errors .New ("the volume is locked by another process" ) // returned by createSnapshotIfNeeded when lockVolume reports the lock was not acquired; createSnapshot requeues on it
7172)
7273
7374// FinBackupReconciler reconciles a FinBackup object
@@ -78,6 +79,7 @@ type FinBackupReconciler struct {
7879 podImage string
7980 maxPartSize * resource.Quantity
8081 snapRepo model.RBDSnapshotRepository
82+ imageLocker model.RBDImageLocker
8183 rawImgExpansionUnitSize uint64
8284}
8385
@@ -88,6 +90,7 @@ func NewFinBackupReconciler(
8890 podImage string ,
8991 maxPartSize * resource.Quantity ,
9092 snapRepo model.RBDSnapshotRepository ,
93+ imageLocker model.RBDImageLocker ,
9194 rawImgExpansionUnitSize uint64 ,
9295) * FinBackupReconciler {
9396 return & FinBackupReconciler {
@@ -97,6 +100,7 @@ func NewFinBackupReconciler(
97100 podImage : podImage ,
98101 maxPartSize : maxPartSize ,
99102 snapRepo : snapRepo ,
103+ imageLocker : imageLocker ,
100104 rawImgExpansionUnitSize : rawImgExpansionUnitSize ,
101105 }
102106}
@@ -386,7 +390,7 @@ func (r *FinBackupReconciler) createSnapshot(ctx context.Context, backup *finv1.
386390 if annotations == nil {
387391 annotations = map [string ]string {}
388392 }
389- annotations [annotationBackupTargetRBDImage ] = rbdImage
393+ annotations [AnnotationBackupTargetRBDImage ] = rbdImage
390394 annotations [annotationRBDPool ] = rbdPool
391395 backup .SetAnnotations (annotations )
392396
@@ -396,8 +400,13 @@ func (r *FinBackupReconciler) createSnapshot(ctx context.Context, backup *finv1.
396400 return ctrl.Result {}, err
397401 }
398402
399- snap , err := r .createSnapshotIfNeeded (rbdPool , rbdImage , snapshotName (backup ))
403+ snap , err := r .createSnapshotIfNeeded (rbdPool , rbdImage , snapshotName (backup ), lockID ( backup ) )
400404 if err != nil {
405+ if errors .Is (err , errVolumeLockedByAnother ) {
406+ logger .Info ("the volume is locked by another process" , "uid" , string (backup .GetUID ()))
407+ // FIXME: The following "requeue after" is temporary code.
408+ return ctrl.Result {RequeueAfter : 5 * time .Second }, nil
409+ }
401410 logger .Error (err , "failed to create or get snapshot" )
402411 return ctrl.Result {}, err
403412 }
@@ -571,12 +580,20 @@ func (r *FinBackupReconciler) reconcileDelete(
571580 return ctrl.Result {}, nil
572581}
573582
574- func (r * FinBackupReconciler ) createSnapshotIfNeeded (rbdPool , rbdImage , snapName string ) (* model.RBDSnapshot , error ) {
583+ func (r * FinBackupReconciler ) createSnapshotIfNeeded (rbdPool , rbdImage , snapName , lockID string ) (* model.RBDSnapshot , error ) {
575584 snap , err := findSnapshot (r .snapRepo , rbdPool , rbdImage , snapName )
576585 if err != nil {
577586 if ! errors .Is (err , model .ErrNotFound ) {
578587 return nil , fmt .Errorf ("failed to get snapshot: %w" , err )
579588 }
589+
590+ lockSuccess , err := r .lockVolume (rbdPool , rbdImage , lockID )
591+ if err != nil {
592+ return nil , fmt .Errorf ("failed to lock image: %w" , err )
593+ }
594+ if ! lockSuccess {
595+ return nil , errVolumeLockedByAnother
596+ }
580597 err = r .snapRepo .CreateSnapshot (rbdPool , rbdImage , snapName )
581598 if err != nil {
582599 return nil , fmt .Errorf ("failed to create snapshot: %w" , err )
@@ -586,6 +603,10 @@ func (r *FinBackupReconciler) createSnapshotIfNeeded(rbdPool, rbdImage, snapName
586603 return nil , fmt .Errorf ("failed to get snapshot after creation: %w" , err )
587604 }
588605 }
606+ if err := r .unlockVolume (rbdPool , rbdImage , lockID ); err != nil {
607+ return nil , fmt .Errorf ("failed to unlock image: %w" , err )
608+ }
609+
589610 return snap , nil
590611}
591612
@@ -613,6 +634,70 @@ func (r *FinBackupReconciler) removeSnapshot(ctx context.Context, backup *finv1.
613634 return nil
614635}
615636
637+ // lockVolume tries to take the lock identified by lockID on the RBD volume poolName/imageName.
638+ // It returns (true, nil) if this caller holds the lock, (false, nil) if exactly one lock with a different ID exists, and (false, err) if LockAdd failed and the lock state is unknown or unexpected.
639+ func (r * FinBackupReconciler ) lockVolume (
640+ poolName , imageName , lockID string ,
641+ ) (bool , error ) {
642+ // Try to add the lock first; the existing locks are listed only when that attempt fails.
643+ if errAdd := r .imageLocker .LockAdd (poolName , imageName , lockID ); errAdd != nil {
644+ locks , errLs := r .imageLocker .LockLs (poolName , imageName )
645+ if errLs != nil {
646+ return false , fmt .Errorf ("failed to add a lock and list locks on volume %s/%s: %w" , poolName , imageName , errors .Join (errAdd , errLs ))
647+ }
648+
649+ switch len (locks ) {
650+ case 0 :
651+ // The lock may have been released between the failed LockAdd and LockLs, but LockAdd may also have failed for another reason, so the original error is reported.
652+ return false , fmt .Errorf ("failed to add a lock to the volume %s/%s: %w" , poolName , imageName , errAdd )
653+
654+ case 1 :
655+ if locks [0 ].LockID == lockID {
656+ // The existing lock carries our lock ID, so the caller already holds it.
657+ return true , nil
658+ }
659+ // The single existing lock has a different ID: someone else holds it. This is not an error.
660+ return false , nil
661+
662+ default :
663+ // Two or more locks were listed; this code treats that as an unexpected state.
664+ return false , fmt .Errorf ("multiple locks found on volume %s/%s after failed lock attempt(%v)" , poolName , imageName , locks )
665+ }
666+ }
667+
668+ // LockAdd succeeded, so the lock is now held by the caller.
669+ return true , nil
670+ }
671+
672+ // unlockVolume removes the lock identified by lockID from the RBD volume poolName/imageName if it is present.
673+ // It returns nil without doing anything if no such lock is listed, and an error if listing fails, two or more locks are listed, or the removal fails.
674+ func (r * FinBackupReconciler ) unlockVolume (
675+ poolName , imageName , lockID string ,
676+ ) error {
677+ // List the current locks to find out whether ours is among them.
678+ locks , err := r .imageLocker .LockLs (poolName , imageName )
679+ if err != nil {
680+ return fmt .Errorf ("failed to list locks of the volume %s/%s: %w" , poolName , imageName , err )
681+ }
682+
683+ if len (locks ) >= 2 {
684+ return fmt .Errorf ("multiple locks found on volume %s/%s when unlocking (%v)" , poolName , imageName , locks )
685+ }
686+
687+ for _ , lock := range locks {
688+ if lock .LockID == lockID {
689+ // Our lock is present; remove it.
690+ if err := r .imageLocker .LockRm (poolName , imageName , lock ); err != nil {
691+ return fmt .Errorf ("failed to remove the lock from the volume %s/%s: %w" , poolName , imageName , err )
692+ }
693+ return nil
694+ }
695+ }
696+
697+ // No lock with our ID is listed, so there is nothing to remove.
698+ return nil
699+ }
700+
616701func (r * FinBackupReconciler ) getRBDPoolAndImageFromPVC (
617702 ctx context.Context ,
618703 pvc * corev1.PersistentVolumeClaim ,
@@ -637,7 +722,7 @@ func (r *FinBackupReconciler) getRBDPoolAndImageFromPVC(
637722
638723func (r * FinBackupReconciler ) getRBDPoolAndImage (ctx context.Context , backup * finv1.FinBackup ) (string , string , error ) {
639724 rbdPool := backup .GetAnnotations ()[annotationRBDPool ]
640- rbdImage := backup .GetAnnotations ()[annotationBackupTargetRBDImage ]
725+ rbdImage := backup .GetAnnotations ()[AnnotationBackupTargetRBDImage ]
641726 if rbdPool != "" && rbdImage != "" {
642727 return rbdPool , rbdImage , nil
643728 }
@@ -733,6 +818,10 @@ func cleanupJobName(backup *finv1.FinBackup) string {
733818 return "fin-cleanup-" + string (backup .GetUID ())
734819}
735820
821+ func lockID (backup * finv1.FinBackup ) string {
822+ return string (backup .GetUID ()) // the FinBackup UID is used as the lock ID passed to createSnapshotIfNeeded
823+ }
824+
736825func (r * FinBackupReconciler ) createOrUpdateBackupJob (
737826 ctx context.Context , backup * finv1.FinBackup , diffFrom string ,
738827 backupTargetPVCUID string , maxPartSize * resource.Quantity ,
@@ -794,7 +883,7 @@ func (r *FinBackupReconciler) createOrUpdateBackupJob(
794883 },
795884 {
796885 Name : "RBD_IMAGE_NAME" ,
797- Value : backup .GetAnnotations ()[annotationBackupTargetRBDImage ],
886+ Value : backup .GetAnnotations ()[AnnotationBackupTargetRBDImage ],
798887 },
799888 {
800889 Name : "BACKUP_SNAPSHOT_ID" ,
0 commit comments