Skip to content

Commit b594cd4

Browse files
committed
fix: add goroutine to handle remotemachine deletion when provisioning hangs
Signed-off-by: apedriza <[email protected]>
1 parent 6f3016f commit b594cd4

File tree

5 files changed

+83
-11
lines changed

5 files changed

+83
-11
lines changed

api/infrastructure/v1beta1/remote_machine_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ type RemoteMachineList struct {
140140
// +kubebuilder:subresource:status
141141
// +kubebuilder:metadata:labels="cluster.x-k8s.io/v1beta1=v1beta1"
142142
// +kubebuilder:metadata:labels="cluster.x-k8s.io/provider=infrastructure-k0smotron"
143+
// +kubebuilder:printcolumn:name="Address",type=string,JSONPath=".spec.machine.address",description="IP address or DNS name of the remote machine"
144+
// +kubebuilder:printcolumn:name="Reserved",type=string,JSONPath=".status.reserved",description="Indicates if the machine is reserved"
145+
// +kubebuilder:printcolumn:name="Remote Machine",type=string,JSONPath=".status.machineRef.name",description="Reference to the RemoteMachine"
143146

144147
type PooledRemoteMachine struct {
145148
metav1.TypeMeta `json:",inline"`

config/clusterapi/infrastructure/bases/infrastructure.cluster.x-k8s.io_pooledremotemachines.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,20 @@ spec:
1717
singular: pooledremotemachine
1818
scope: Namespaced
1919
versions:
20-
- name: v1beta1
20+
- additionalPrinterColumns:
21+
- description: IP address or DNS name of the remote machine
22+
jsonPath: .spec.machine.address
23+
name: Address
24+
type: string
25+
- description: Indicates if the machine is reserved
26+
jsonPath: .status.reserved
27+
name: Reserved
28+
type: string
29+
- description: Reference to the RemoteMachine
30+
jsonPath: .status.machineRef.name
31+
name: Remote Machine
32+
type: string
33+
name: v1beta1
2134
schema:
2235
openAPIV3Schema:
2336
properties:

config/crd/bases/infrastructure/infrastructure.cluster.x-k8s.io_pooledremotemachines.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,20 @@ spec:
1717
singular: pooledremotemachine
1818
scope: Namespaced
1919
versions:
20-
- name: v1beta1
20+
- additionalPrinterColumns:
21+
- description: IP address or DNS name of the remote machine
22+
jsonPath: .spec.machine.address
23+
name: Address
24+
type: string
25+
- description: Indicates if the machine is reserved
26+
jsonPath: .status.reserved
27+
name: Reserved
28+
type: string
29+
- description: Reference to the RemoteMachine
30+
jsonPath: .status.machineRef.name
31+
name: Remote Machine
32+
type: string
33+
name: v1beta1
2134
schema:
2235
openAPIV3Schema:
2336
properties:

internal/controller/infrastructure/remote_machine_controller.go

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package infrastructure
1919
import (
2020
"context"
2121
"fmt"
22+
"time"
2223

2324
"gopkg.in/yaml.v3"
2425
v1 "k8s.io/api/core/v1"
@@ -33,6 +34,7 @@ import (
3334
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3435
capiutil "sigs.k8s.io/cluster-api/util"
3536
"sigs.k8s.io/cluster-api/util/annotations"
37+
"sigs.k8s.io/cluster-api/util/finalizers"
3638
"sigs.k8s.io/cluster-api/util/patch"
3739
ctrl "sigs.k8s.io/controller-runtime"
3840
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -90,6 +92,10 @@ func (r *RemoteMachineController) Reconcile(ctx context.Context, req ctrl.Reques
9092
return ctrl.Result{}, err
9193
}
9294

95+
if finalizerAdded, err := finalizers.EnsureFinalizer(ctx, r.Client, rm, RemoteMachineFinalizer); err != nil || finalizerAdded {
96+
return ctrl.Result{}, err
97+
}
98+
9399
// Fetch the Machine that owns the RemoteMachine
94100
machine, err := capiutil.GetOwnerMachine(ctx, r.Client, rm.ObjectMeta)
95101
if err != nil {
@@ -246,9 +252,25 @@ func (r *RemoteMachineController) Reconcile(ctx context.Context, req ctrl.Reques
246252
return ctrl.Result{}, nil
247253
}
248254

249-
if !controllerutil.ContainsFinalizer(rm, RemoteMachineFinalizer) {
250-
controllerutil.AddFinalizer(rm, RemoteMachineFinalizer)
251-
}
255+
ctx, cancel := context.WithCancel(ctx)
256+
defer cancel()
257+
258+
// Running a goroutine to monitor if the RemoteMachine gets deleted during provisioning. This way we can
259+
// proceed to cleanup immediately without waiting for the provisioning to timeout. For example in scenarios where
260+
// the bootstrap process hangs and the controller needs to be able to delete the RemoteMachine. Controller-runtime
261+
// only runs one Reconcile at a time per object, so we need to monitor deletion in a separate goroutine.
262+
go func() {
263+
for {
264+
updatedRemoteMachine := &infrastructure.RemoteMachine{}
265+
if err := r.Get(ctx, client.ObjectKeyFromObject(rm), updatedRemoteMachine); err == nil &&
266+
!updatedRemoteMachine.DeletionTimestamp.IsZero() {
267+
log.Info("Cancelling Bootstrap because the underlying machine has been deleted")
268+
cancel()
269+
return
270+
}
271+
time.Sleep(5 * time.Second)
272+
}
273+
}()
252274

253275
defer func() {
254276
log.Info("Reconcile complete")
@@ -287,6 +309,10 @@ func (r *RemoteMachineController) Reconcile(ctx context.Context, req ctrl.Reques
287309
m.Labels[l] = rm.Labels[l]
288310
}
289311
}
312+
313+
if len(m.Annotations) == 0 {
314+
m.Annotations = make(map[string]string)
315+
}
290316
for k := range rm.Annotations {
291317
if _, ok := m.Annotations[k]; !ok {
292318
m.Annotations[k] = rm.Annotations[k]

internal/controller/infrastructure/ssh_provisioner.go

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,26 +104,30 @@ func (p *SSHProvisioner) Provision(ctx context.Context) error {
104104

105105
// Write files first
106106
for _, file := range p.cloudInit.Files {
107+
p.log.Info("Uploading file", "path", file.Path, "permissions", file.Permissions)
107108
if err := p.uploadFile(rigClient, file); err != nil {
108109
return fmt.Errorf("failed to upload file: %w", err)
109110
}
111+
p.log.Info("Uploaded file", "path", file.Path, "permissions", file.Permissions)
110112
}
111113

112114
if p.machine.Spec.CommandsAsScript {
113115
// Run the install script
114116
installScriptPath := filepath.Join(p.machine.Spec.WorkingDir, "k0s_install.sh")
117+
p.log.Info("running install script", "command", installScriptPath)
115118
output, err := rigClient.ExecOutput(installScriptPath)
116119
if err != nil {
117-
p.log.Error(err, "failed to run command", "output", output)
120+
p.log.Error(err, "failed to run command", "command", installScriptPath, "output", output)
118121
return fmt.Errorf("failed to run command: %w", err)
119122
}
120-
log.Info("executed command", "command", installScriptPath, "output", output)
123+
log.Info("executed install script", "command", installScriptPath, "output", output)
121124
} else {
122125
// Run commands
123126
for _, cmd := range p.cloudInit.Commands {
124-
output, err := rigClient.ExecOutput(cmd)
127+
p.log.Info("running command", "command", cmd)
128+
output, err := rigClient.ExecOutputContext(ctx, cmd)
125129
if err != nil {
126-
p.log.Error(err, "failed to run command", "output", output)
130+
p.log.Error(err, "failed to run command", "command", cmd, "output", output)
127131
return fmt.Errorf("failed to run command: %w", err)
128132
}
129133
log.Info("executed command", "command", cmd, "output", output)
@@ -189,6 +193,7 @@ func (p *SSHProvisioner) Cleanup(ctx context.Context, mode RemoteMachineMode) er
189193

190194
if mode == ModeNonK0s {
191195
// If k0s is not the bootstrap provider, we have nothing to do.
196+
p.log.Info("k0smotron is not the bootstrap provider and no cleanup commands specified, skipping cleanup")
192197
return nil
193198
}
194199

@@ -219,9 +224,21 @@ func (p *SSHProvisioner) Cleanup(ctx context.Context, mode RemoteMachineMode) er
219224

220225
p.log.Info("Cleaning up remote machine...")
221226
for _, cmd := range cmds {
222-
output, err := rigClient.ExecOutput(cmd)
227+
output, err := rigClient.ExecOutputContext(ctx, cmd)
223228
if err != nil {
224-
p.log.Error(err, "failed to run command", "output", output)
229+
// if k0s command is not installed, manually remove files added for k0s bootstrap.
230+
if strings.Contains(err.Error(), "command not found") {
231+
for _, file := range p.cloudInit.Files {
232+
p.log.Info("Removing file", "path", file.Path)
233+
err := rigClient.Sudo().FS().Remove(file.Path)
234+
if err != nil {
235+
p.log.Error(err, "failed to remove file", "path", file.Path)
236+
} else {
237+
p.log.Info("Removed file", "path", file.Path)
238+
}
239+
}
240+
}
241+
p.log.Error(err, "failed to run command", "command", cmd, "output", output)
225242
}
226243
}
227244

0 commit comments

Comments
 (0)