diff --git a/internal/deployers/eksapi/infra.go b/internal/deployers/eksapi/infra.go
index fd620a33e..bc5d19bba 100644
--- a/internal/deployers/eksapi/infra.go
+++ b/internal/deployers/eksapi/infra.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates"
 	"github.com/aws/aws-k8s-tester/internal/metrics"
+	"github.com/aws/aws-k8s-tester/internal/util"
 )
 
 const (
@@ -531,7 +532,7 @@ func (m *InfrastructureManager) createCloudWatchInfrastructureStack(clusterName
 		StackName: out.StackId,
 	}, infraStackCreationTimeout); err != nil {
-		return "", fmt.Errorf("failed to wait for CloudWatch infrastructure stack creation: %w", err)
+		return "", util.WrapCFNStackFailure(context.TODO(), m.clients.CFN(), fmt.Errorf("failed to wait for CloudWatch infrastructure stack creation: %w", err), stackName)
 	}
 
 	// Get the CloudWatch role ARN from stack outputs
diff --git a/internal/deployers/eksapi/node.go b/internal/deployers/eksapi/node.go
index 07824cf07..2f694f282 100644
--- a/internal/deployers/eksapi/node.go
+++ b/internal/deployers/eksapi/node.go
@@ -28,6 +28,7 @@ import (
 	karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
 
 	"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates"
+	"github.com/aws/aws-k8s-tester/internal/util"
 
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 )
@@ -504,7 +505,7 @@ func (m *nodeManager) createUnmanagedNodegroup(infra *Infrastructure, cluster *C
 	}, opts.NodeCreationTimeout)
 	if err != nil {
-		return fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err)
+		return util.WrapCFNStackFailure(context.TODO(), m.clients.CFN(), fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err), stackName)
 	}
 	klog.Infof("created unmanaged nodegroup stack: %s", *out.StackId)
 	if opts.ExpectedAMI != "" {
diff --git a/internal/util/cloudformation.go b/internal/util/cloudformation.go
new file mode 100644
index 000000000..d84a4a85c
--- /dev/null
+++ b/internal/util/cloudformation.go
@@ -0,0 +1,54 @@
+package util
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/cloudformation"
+	"github.com/aws/aws-sdk-go-v2/service/cloudformation/types"
+)
+
+// WrapCFNStackFailure annotates createStackErr with the CREATE_FAILED resource
+// events from the named stack, grouped by failure reason.
+//
+// TODO: implement AWS client wrappers, and incorporate this into the cfn:CreateStack call
+func WrapCFNStackFailure(ctx context.Context, cfnClient *cloudformation.Client, createStackErr error, stackName string) error {
+	if createStackErr == nil {
+		return nil
+	}
+	resourcesByFailureReason := make(map[string][]string)
+	eventsPaginator := cloudformation.NewDescribeStackEventsPaginator(cfnClient, &cloudformation.DescribeStackEventsInput{
+		StackName: &stackName,
+	})
+	for eventsPaginator.HasMorePages() {
+		page, err := eventsPaginator.NextPage(ctx)
+		if err != nil {
+			// fall back to the original error if the stack events can't be read
+			return createStackErr
+		}
+		for _, event := range page.StackEvents {
+			if event.ResourceStatus == types.ResourceStatusCreateFailed {
+				reason := aws.ToString(event.ResourceStatusReason)
+				resourcesByFailureReason[reason] = append(resourcesByFailureReason[reason], aws.ToString(event.LogicalResourceId))
+			}
+		}
+	}
+	multipleFailureReasons := len(resourcesByFailureReason) > 1
+	var enhancedDetails []string
+	for reason, resources := range resourcesByFailureReason {
+		if multipleFailureReasons && reason == "Resource creation cancelled" {
+			// Ignore resource cancellation errors when another failure reason is
+			// reported; the cancellations are just a consequence of that failure.
+			// If every failure is a cancellation, the whole stack was likely deleted
+			// by the user after timing out waiting for one of its resources to create.
+			continue
+		}
+		enhancedDetails = append(enhancedDetails, fmt.Sprintf("%s: %s", strings.Join(resources, ","), reason))
+	}
+	if len(enhancedDetails) == 0 {
+		return createStackErr
+	}
+	return fmt.Errorf("%w: %s", createStackErr, strings.Join(enhancedDetails, "--"))
+}
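
A minimal sketch of how a call site might use `WrapCFNStackFailure`, assuming it lives inside the aws-k8s-tester module (`internal/util` is not importable from outside). The stack name, template body, and wait duration are hypothetical, and running it requires AWS credentials:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/cloudformation"

	"github.com/aws/aws-k8s-tester/internal/util"
)

// hypothetical minimal template with a single resource
const templateBody = `{"Resources":{"Bucket":{"Type":"AWS::S3::Bucket"}}}`

func main() {
	ctx := context.TODO()
	cfg, err := config.LoadDefaultConfig(ctx)
	if err != nil {
		log.Fatal(err)
	}
	cfnClient := cloudformation.NewFromConfig(cfg)

	stackName := "wrap-cfn-failure-demo" // hypothetical stack name
	if _, err := cfnClient.CreateStack(ctx, &cloudformation.CreateStackInput{
		StackName:    aws.String(stackName),
		TemplateBody: aws.String(templateBody),
	}); err != nil {
		log.Fatal(err)
	}

	waiter := cloudformation.NewStackCreateCompleteWaiter(cfnClient)
	if err := waiter.Wait(ctx, &cloudformation.DescribeStacksInput{
		StackName: aws.String(stackName),
	}, 30*time.Minute); err != nil {
		// On failure, the wrapped error appends "LogicalId1,LogicalId2: reason"
		// segments (joined by "--") to the waiter error, so the log shows which
		// resources failed and why instead of just a generic timeout.
		log.Fatal(util.WrapCFNStackFailure(ctx, cfnClient,
			fmt.Errorf("failed to wait for stack creation: %w", err), stackName))
	}
}
```

Once the TODO in `cloudformation.go` lands, the wrapping would presumably move into an AWS client wrapper so individual call sites don't have to repeat it.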