
Commit fbeef8a

fix: support more connection errors in reboot test (#695)
1 parent ecaddc7 commit fbeef8a

2 files changed: +49 -30 lines


internal/e2e/health.go

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+// KubeletIsResponsive returns true if the kubelet /healthz endpoint responds with a 200 status code, and propagates
+// any non-connection specific errors
+func KubeletIsResponsive(ctx context.Context, cfg *rest.Config, nodeName string) (bool, error) {
+	client, err := kubernetes.NewForConfig(cfg)
+	if err != nil {
+		return false, fmt.Errorf("failed to initialize client set: %v", err)
+	}
+
+	nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
+		Name(nodeName).SubResource("proxy").Suffix("/healthz").
+		Do(ctx)
+
+	if nodeHealthResponse.Error() != nil {
+		errMsg := nodeHealthResponse.Error().Error()
+		// TODO: match errors against types, e.g. syscall.ECONNREFUSED instead, the k8s client doesn't
+		// currently properly wrap the underlying error to allow this though
+		if strings.Contains(errMsg, "connection refused") ||
+			strings.Contains(errMsg, "connection reset by peer") ||
+			strings.Contains(errMsg, "http2: client connection lost") {
+			// these errors indicate reachability to the node in general but an unstable connection to kubelet
+			return false, nil
+		}
+
+		// propagate other errors, e.g. i/o timeout, that may result from things unrelated to kubelet health,
+		// e.g. security group rules on the instance restricting traffic from the CP
+		return false, fmt.Errorf("could not reach /healthz endpoint for node %s: %w", nodeName, nodeHealthResponse.Error())
+	}
+
+	var statusCode int
+	nodeHealthResponse.StatusCode(&statusCode)
+	return statusCode == 200, nil
+}
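The TODO above points at a cleaner alternative: matching the connection errors by type rather than by substring. A minimal sketch of what that could look like, assuming client-go eventually wraps the underlying syscall error so that errors.Is can reach it; the helper and its name are illustrative and not part of this commit:

package e2e

import (
	"errors"
	"syscall"
)

// isTransientKubeletConnError reports whether err looks like an unstable connection to the
// kubelet (connection refused / connection reset by peer) rather than a broader reachability
// problem such as an i/o timeout. Only meaningful if the REST client wraps the underlying error.
func isTransientKubeletConnError(err error) bool {
	return errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.ECONNRESET)
}

The "http2: client connection lost" case has no errno equivalent, so it would still need separate handling even with typed matching.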

test/cases/disruptive/graceful_reboot_test.go

Lines changed: 6 additions & 30 deletions

@@ -16,7 +16,6 @@ import (
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/client-go/kubernetes"
 
 	"sigs.k8s.io/e2e-framework/klient/wait"
 	"sigs.k8s.io/e2e-framework/pkg/envconf"
@@ -72,24 +71,14 @@ func TestGracefulReboot(t *testing.T) {
 
 	t.Logf("Pod %s is running on node %s", terminationCanaryPodName, targetNode.Name)
 
-	client, err := kubernetes.NewForConfig(cfg.Client().RESTConfig())
-	if err != nil {
-		t.Fatalf("Failed to initialize kubernetes client set: %v", err)
-	}
-
 	// Do an initial check of the /healthz endpoint reachability to ensure we can rely on it later.
 	// This might fail even if the node is healthy if, for example, the node's security group rules
 	// do not allow ingress traffic from the control plane
-	nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
-		Name(targetNode.Name).SubResource("proxy").Suffix("/healthz").
-		Do(ctx)
-
-	if nodeHealthResponse.Error() != nil {
-		t.Fatalf("Unexpected error checking node %s /healthz endpoint: %v", targetNode.Name, err)
+	kubeletResponsive, err := fwext.KubeletIsResponsive(ctx, cfg.Client().RESTConfig(), targetNode.Name)
+	if err != nil || !kubeletResponsive {
+		t.Fatalf("Node %s is not responding to initial /healthz checks: %v", targetNode.Name, err)
 	}
 
-	t.Logf("Node %s is responding to /healthz", targetNode.Name)
-
 	providerIDParts := strings.Split(targetNode.Spec.ProviderID, "/")
 	instanceID := providerIDParts[len(providerIDParts)-1]
 	t.Logf("Rebooting underlying instance %s for node %s...", instanceID, targetNode.Name)
@@ -108,26 +97,13 @@ func TestGracefulReboot(t *testing.T) {
 
 	// Use kubelet health probes as the signal for instance shutdown. Since the health endpoint
 	// could previously be reached, a refused connection implies kubelet was killed.
-	var kubeletConnectionLost bool
-	for !kubeletConnectionLost {
+	for kubeletResponsive {
 		select {
 		case <-kubeletShutdownCtx.Done():
			t.Fatalf("Failed to wait for kubelet to become unresponsive: %v", ctx.Err())
 		case <-time.Tick(1 * time.Second):
-			nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes").
-				Name(targetNode.Name).SubResource("proxy").Suffix("/healthz").
-				Do(ctx)
-
-			if nodeHealthResponse.Error() != nil {
-				// TODO: match error against syscall.ECONNREFUSED instead, the k8s client doesn't
-				// currently properly wrap the underlying error to allow this though
-				if strings.Contains(nodeHealthResponse.Error().Error(), "connection refused") {
-					kubeletConnectionLost = true
-				} else {
-					t.Fatalf("Unpexected error while monitoring kubelet on node %s: %v", targetNode.Name, nodeHealthResponse.Error())
-				}
-			} else {
-				t.Logf("Node %s still responding to /healthz", targetNode.Name)
+			if kubeletResponsive, err = fwext.KubeletIsResponsive(ctx, cfg.Client().RESTConfig(), targetNode.Name); err != nil {
+				t.Fatalf("Unpexected error while monitoring kubelet on node %s: %v", targetNode.Name, err)
			}
 		}
 	}
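For reuse outside this test, the polling loop above could be factored into the same e2e package as KubeletIsResponsive. A rough sketch under that assumption; the helper and its name are illustrative and not part of this commit:

package e2e

import (
	"context"
	"fmt"
	"time"

	"k8s.io/client-go/rest"
)

// WaitForKubeletUnresponsive probes the node's /healthz proxy endpoint once per second and
// returns nil as soon as the kubelet stops responding, or an error if ctx expires first or a
// non-connection error is encountered.
func WaitForKubeletUnresponsive(ctx context.Context, cfg *rest.Config, nodeName string) error {
	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("timed out waiting for kubelet on node %s to become unresponsive: %w", nodeName, ctx.Err())
		case <-ticker.C:
			responsive, err := KubeletIsResponsive(ctx, cfg, nodeName)
			if err != nil {
				return fmt.Errorf("unexpected error while monitoring kubelet on node %s: %w", nodeName, err)
			}
			if !responsive {
				return nil
			}
		}
	}
}

Using time.NewTicker with a deferred Stop, rather than time.Tick as in the test, avoids leaking the underlying ticker once the helper returns.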
