Commit ecaddc7

fix: make reboot test more deterministic (#693)
1 parent d899392 commit ecaddc7

File tree

1 file changed (+70 -61)


test/cases/disruptive/graceful_reboot_test.go

Lines changed: 70 additions & 61 deletions
@@ -3,29 +3,27 @@
 package disruptive

 import (
-    "bytes"
     "context"
     "fmt"
     "strings"
     "testing"
     "time"

+    "github.com/aws/aws-k8s-tester/internal/awssdk"
     fwext "github.com/aws/aws-k8s-tester/internal/e2e"

-    "github.com/aws/aws-k8s-tester/internal/awssdk"
     "github.com/aws/aws-sdk-go-v2/service/ec2"

     corev1 "k8s.io/api/core/v1"
-    "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/client-go/util/exec"
+    "k8s.io/client-go/kubernetes"

     "sigs.k8s.io/e2e-framework/klient/wait"
     "sigs.k8s.io/e2e-framework/pkg/envconf"
     "sigs.k8s.io/e2e-framework/pkg/features"
 )

-func getSleepPodTemplate(name string, targetNodeName string, duration string) corev1.Pod {
+func getSleepPodTemplate(name string) corev1.Pod {
     return corev1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             Name: name,
@@ -36,47 +34,26 @@ func getSleepPodTemplate(name string, targetNodeName string, duration string) corev1.Pod {
                 {
                     Name: name,
                     Image: "public.ecr.aws/amazonlinux/amazonlinux:2023",
-                    Command: []string{"sleep", duration},
-                    Resources: corev1.ResourceRequirements{
-                        Requests: corev1.ResourceList{
-                            corev1.ResourceCPU: resource.MustParse("250m"),
-                            corev1.ResourceMemory: resource.MustParse("64Mi"),
-                        },
-                        Limits: corev1.ResourceList{
-                            corev1.ResourceCPU: resource.MustParse("250m"),
-                            corev1.ResourceMemory: resource.MustParse("64Mi"),
-                        },
-                    },
+                    Command: []string{"sleep", "infinity"},
                 },
             },
             RestartPolicy: corev1.RestartPolicyNever,
-            NodeName: targetNodeName,
-            Resources: &corev1.ResourceRequirements{
-                // set high pod limits to make sure the pod does not get
-                // OOMKilled, and make requests equal to qualify the pod
-                // for the Guaranteed Quality of Service class
-                Requests: corev1.ResourceList{
-                    corev1.ResourceCPU: resource.MustParse("250m"),
-                    corev1.ResourceMemory: resource.MustParse("64Mi"),
-                },
-                Limits: corev1.ResourceList{
-                    corev1.ResourceCPU: resource.MustParse("250m"),
-                    corev1.ResourceMemory: resource.MustParse("64Mi"),
-                },
-            },
         },
     }
 }

 func TestGracefulReboot(t *testing.T) {
     terminationCanaryPodName := fmt.Sprintf("termination-canary-%d", time.Now().Unix())
-    canaryPod := getSleepPodTemplate(terminationCanaryPodName, "", "infinity")
+    canaryPod := getSleepPodTemplate(terminationCanaryPodName)
     bootIndicatorPodName := fmt.Sprintf("boot-detection-%d", time.Now().Unix())
-    bootIndicatorPod := getSleepPodTemplate(bootIndicatorPodName, "", "infinity")
+    bootIndicatorPod := getSleepPodTemplate(bootIndicatorPodName)

     feat := features.New("graceful-reboot").
         WithLabel("suite", "disruptive").
         Assess("Node gracefully reboots", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
+            // Create an initial pod to allow the default scheduler to do the work of identifying a healthy node.
+            // Starting with a healthy node is essential to the test, as the only expectation is for the node to
+            // return to its same initial state after the reboot.
             if err := cfg.Client().Resources().Create(ctx, &canaryPod); err != nil {
                 t.Fatalf("Failed to create heartbeat pod: %v", err)
             }
@@ -85,48 +62,84 @@ func TestGracefulReboot(t *testing.T) {
                 wait.WithContext(ctx),
                 wait.WithTimeout(5*time.Minute),
             ); err != nil {
-                t.Fatalf("Failed to wait for pod to go into running status %s: %v", terminationCanaryPodName, err)
+                t.Fatalf("Failed to wait for pod %s to go into running status: %v", terminationCanaryPodName, err)
             }

             var targetNode corev1.Node
             if err := cfg.Client().Resources().Get(ctx, canaryPod.Spec.NodeName, "", &targetNode); err != nil {
-                t.Fatalf("failed to get node %s: %v", canaryPod.Spec.NodeName, err)
+                t.Fatalf("Failed to get node %s: %v", canaryPod.Spec.NodeName, err)
+            }
+
+            t.Logf("Pod %s is running on node %s", terminationCanaryPodName, targetNode.Name)
+
+            client, err := kubernetes.NewForConfig(cfg.Client().RESTConfig())
+            if err != nil {
+                t.Fatalf("Failed to initialize kubernetes client set: %v", err)
             }

+            // Do an initial check of the /healthz endpoint reachability to ensure we can rely on it later.
+            // This might fail even if the node is healthy if, for example, the node's security group rules
+            // do not allow ingress traffic from the control plane.
+            nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
+                Name(targetNode.Name).SubResource("proxy").Suffix("/healthz").
+                Do(ctx)
+
+            if nodeHealthResponse.Error() != nil {
+                t.Fatalf("Unexpected error checking node %s /healthz endpoint: %v", targetNode.Name, nodeHealthResponse.Error())
+            }
+
+            t.Logf("Node %s is responding to /healthz", targetNode.Name)
+
             providerIDParts := strings.Split(targetNode.Spec.ProviderID, "/")
             instanceID := providerIDParts[len(providerIDParts)-1]
-            t.Logf("Node %s corresponds to EC2 instance: %s", targetNode.Name, instanceID)
+            t.Logf("Rebooting underlying instance %s for node %s...", instanceID, targetNode.Name)

             ec2Client := ec2.NewFromConfig(awssdk.NewConfig())
-
-            // TODO: make sure the exec starts before the reboot to promote better determinism
-            t.Logf("Rebooting instance %s to test graceful reboot...", instanceID)
-            _, err := ec2Client.RebootInstances(ctx, &ec2.RebootInstancesInput{
+            if _, err := ec2Client.RebootInstances(ctx, &ec2.RebootInstancesInput{
                 InstanceIds: []string{instanceID},
-            })
-            if err != nil {
-                t.Fatalf("Failed to reboot EC2 instance %s: %v", instanceID, err)
+            }); err != nil {
+                t.Fatalf("Failed to reboot instance %s: %v", instanceID, err)
             }
-            t.Logf("Successfully initiated reboot of instance %s, waiting for pod %s to terminate...", instanceID, canaryPod.Name)

-            t.Logf("Started exec into pod %s", terminationCanaryPodName)
-            // Attempt to execute a blocking command in the pod until we get a 143, which would indicate a SIGTERM.
-            // This a reliable way to check termination since it requires direct response from Kubelet
-            var execOut, execErr bytes.Buffer
-            err = cfg.Client().Resources().ExecInPod(ctx, "default", terminationCanaryPodName, terminationCanaryPodName, []string{"sleep", "infinity"}, &execOut, &execErr)
-            if err != nil {
-                if execErr, ok := err.(exec.CodeExitError); ok && execErr.Code == 143 {
-                    t.Logf("Pod %s was terminated", terminationCanaryPodName)
-                } else {
-                    t.Fatalf("Got unexpected error terminating pod: %v", err)
+            t.Logf("Successfully triggered reboot of instance %s, waiting for kubelet to become unresponsive...", instanceID)
+
+            kubeletShutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+            defer cancel()
+
+            // Use kubelet health probes as the signal for instance shutdown. Since the health endpoint
+            // could previously be reached, a refused connection implies kubelet was killed.
+            var kubeletConnectionLost bool
+            for !kubeletConnectionLost {
+                select {
+                case <-kubeletShutdownCtx.Done():
+                    t.Fatalf("Failed to wait for kubelet to become unresponsive: %v", kubeletShutdownCtx.Err())
+                case <-time.Tick(1 * time.Second):
+                    nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
+                        Name(targetNode.Name).SubResource("proxy").Suffix("/healthz").
+                        Do(ctx)
+
+                    if nodeHealthResponse.Error() != nil {
+                        // TODO: match the error against syscall.ECONNREFUSED instead; the k8s client doesn't
+                        // currently wrap the underlying error properly to allow this
+                        if strings.Contains(nodeHealthResponse.Error().Error(), "connection refused") {
+                            kubeletConnectionLost = true
+                        } else {
+                            t.Fatalf("Unexpected error while monitoring kubelet on node %s: %v", targetNode.Name, nodeHealthResponse.Error())
+                        }
+                    } else {
+                        t.Logf("Node %s still responding to /healthz", targetNode.Name)
+                    }
                 }
             }

-            t.Logf("Waiting up to 10 minutes for node %s to become schedulable again", targetNode.Name)
+            t.Logf("Node %s has become unresponsive, waiting for the node to become schedulable again...", targetNode.Name)

-            // Create a second pod, under the assumption that a new pod cannot be scheduled by a shutting down kubelet
-            // that has already evicted other pods, so this one should only schedule with a new kubelet after boot
-            bootIndicatorPod.Spec.NodeName = targetNode.Name
+            // Create a second pod; we will rely on this pod starting to run as an indication of a healthy state.
+            // Since kubelet was killed at this point, we know the reboot must complete and kubelet must start
+            // again for this pod to start running.
+            bootIndicatorPod.Spec.NodeSelector = map[string]string{
+                "kubernetes.io/hostname": targetNode.Name,
+            }
             if err := cfg.Client().Resources().Create(ctx, &bootIndicatorPod); err != nil {
                 t.Fatalf("Failed to create boot indicator pod: %v", err)
             }
@@ -144,14 +157,10 @@ func TestGracefulReboot(t *testing.T) {
         Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
             if err := cfg.Client().Resources().Delete(ctx, &canaryPod); err != nil {
                 t.Logf("Failed to delete pod %s: %v", terminationCanaryPodName, err)
-            } else {
-                t.Logf("Successfully cleaned up pod %s", terminationCanaryPodName)
             }

             if err := cfg.Client().Resources().Delete(ctx, &bootIndicatorPod); err != nil {
                 t.Logf("Failed to delete pod %s: %v", bootIndicatorPodName, err)
-            } else {
-                t.Logf("Successfully cleaned up pod %s", bootIndicatorPodName)
             }
             return ctx
         }).
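The core of the determinism fix is the /healthz polling loop added above: instead of racing an ExecInPod call against the reboot (note the removed TODO), the test probes the node's kubelet through the API server's nodes/proxy subresource and treats a refused connection as the signal that kubelet has gone down. A minimal standalone sketch of that pattern is shown below; the package name and the waitForKubeletShutdown helper are illustrative, not part of this commit, and it assumes a working client-go clientset for the cluster.

// Sketch only: poll a node's kubelet /healthz endpoint through the API
// server's nodes/proxy subresource until the connection is refused.
package rebootsketch

import (
    "context"
    "fmt"
    "strings"
    "time"

    "k8s.io/client-go/kubernetes"
)

// waitForKubeletShutdown is an illustrative helper, not part of the commit.
// It returns nil once the kubelet health probe fails with "connection refused",
// and an error on timeout or on any other probe failure.
func waitForKubeletShutdown(ctx context.Context, client kubernetes.Interface, nodeName string) error {
    ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
    defer cancel()

    ticker := time.NewTicker(1 * time.Second)
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            return fmt.Errorf("kubelet on %s never became unresponsive: %w", nodeName, ctx.Err())
        case <-ticker.C:
            // Equivalent to GET /api/v1/nodes/<node>/proxy/healthz on the API server,
            // which forwards the request to the node's kubelet.
            result := client.CoreV1().RESTClient().Get().Resource("nodes").
                Name(nodeName).SubResource("proxy").Suffix("/healthz").
                Do(ctx)
            err := result.Error()
            if err == nil {
                continue // kubelet still answering, keep polling
            }
            if strings.Contains(err.Error(), "connection refused") {
                return nil // kubelet is down, i.e. the shutdown has started
            }
            return fmt.Errorf("unexpected error probing kubelet on %s: %w", nodeName, err)
        }
    }
}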

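The other half of the change is how the test detects that the node has come back: the boot-indicator pod is no longer hard-pinned with spec.nodeName (which bypasses the scheduler) but targeted with a nodeSelector on the well-known kubernetes.io/hostname label, so it cannot start running until kubelet on the rebooted node is serving again. A hedged sketch of that pod shape follows; the newBootIndicatorPod constructor is illustrative, not part of the commit.

// Sketch only: target the rebooted node via a nodeSelector on the
// kubernetes.io/hostname label instead of spec.nodeName.
package rebootsketch

import (
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newBootIndicatorPod is an illustrative constructor, not part of the commit.
func newBootIndicatorPod(name, nodeName string) corev1.Pod {
    return corev1.Pod{
        ObjectMeta: metav1.ObjectMeta{Name: name},
        Spec: corev1.PodSpec{
            Containers: []corev1.Container{
                {
                    Name:    name,
                    Image:   "public.ecr.aws/amazonlinux/amazonlinux:2023",
                    Command: []string{"sleep", "infinity"},
                },
            },
            RestartPolicy: corev1.RestartPolicyNever,
            // kubernetes.io/hostname is a well-known node label; the test uses
            // the target node's Name as its value.
            NodeSelector: map[string]string{"kubernetes.io/hostname": nodeName},
        },
    }
}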