From 72ce8b98831ff187ac858c87bd7f3c46e59f2745 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Mon, 24 Mar 2025 12:24:21 +0000 Subject: [PATCH 01/14] add neuron plugin to the test bed --- tests/assets/neuron/config.yaml | 65 +++++ tests/assets/neuron/pod.yaml | 19 ++ .../eks/awscli-cl2-load-with-neuron.yaml | 245 ++++++++++++++++++ .../generators/clusterloader/load-neuron.yaml | 43 +++ .../tasks/setup/eks/awscli-neuron.yaml | 97 +++++++ 5 files changed, 469 insertions(+) create mode 100644 tests/assets/neuron/config.yaml create mode 100644 tests/assets/neuron/pod.yaml create mode 100644 tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml create mode 100644 tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml create mode 100644 tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml diff --git a/tests/assets/neuron/config.yaml b/tests/assets/neuron/config.yaml new file mode 100644 index 00000000..db43b0a8 --- /dev/null +++ b/tests/assets/neuron/config.yaml @@ -0,0 +1,65 @@ +{{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 500}} +{{$neuronResourcesPerPod := DefaultParam .CL2_NEURON_RESOURCES_PER_POD 64}} +{{$neuronPods := DefaultParam .CL2_NEURON_PODS .Nodes}} + +name: neuron-workers +namespace: + number: 1 +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$uniformQps}} + +steps: +- name: Start measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = neuron-worker + threshold: 20s +- name: Create pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: {{$neuronPods}} + tuningSet: UniformQPS + objectBundle: + - basename: neuron-worker + objectTemplatePath: pod.yaml + templateFillMap: + Group: neuron-worker + NeuronResources: {{$neuronResourcesPerPod}} + +- name: Wait for pods to be running + measurements: + - Identifier: WaitForRunningPods + Method: WaitForRunningPods + Params: + action: gather + desiredPodCount: 
{{$neuronPods}} + labelSelector: group = neuron-worker + timeout: 5m + +- name: Measure pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + +- name: Delete pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: neuron-worker + objectTemplatePath: pod.yaml + templateFillMap: + Group: neuron-worker + NeuronResources: {{$neuronResourcesPerPod}} \ No newline at end of file diff --git a/tests/assets/neuron/pod.yaml b/tests/assets/neuron/pod.yaml new file mode 100644 index 00000000..a27a9525 --- /dev/null +++ b/tests/assets/neuron/pod.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: v1 +kind: Pod +metadata: + generateName: neuron-worker- + labels: + group: {{.Group}} +spec: + containers: + - name: main + image: public.ecr.aws/amazonlinux/amazonlinux:2023 + command: + - "sleep" + - "infinity" + resources: + requests: + aws.amazon.com/neuron: "{{.NeuronResources}}" + limits: + aws.amazon.com/neuron: "{{.NeuronResources}}" \ No newline at end of file diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml new file mode 100644 index 00000000..1834f1b2 --- /dev/null +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml @@ -0,0 +1,245 @@ +apiVersion: tekton.dev/v1 +kind: Pipeline +metadata: + name: awscli-eks-neuron-load-test + namespace: scalability +spec: + finally: + - name: teardown + params: + - name: cluster-name + value: $(params.cluster-name) + - name: endpoint + value: $(params.endpoint) + - name: slack-hook + value: $(params.slack-hook) + - name: slack-message + value: $(params.slack-message) job completed + - name: service-role-stack-name + value: $(params.cluster-name)-service-role + - name: node-role-stack-name + value: $(params.cluster-name)-node-role + - name: launch-template-stack-name + value: 
$(params.cluster-name)-launch-template + retries: 2 + taskRef: + kind: Task + name: awscli-eks-cluster-teardown + params: + - name: cluster-name + type: string + - name: endpoint + type: string + - name: desired-nodes + type: string + - name: pods-per-node + type: string + - name: nodes-per-namespace + type: string + - name: instance-types + description: "Instance types for Neuron emulation" + type: string + - name: results-bucket + type: string + - default: "" + name: slack-hook + type: string + - name: slack-message + type: string + - name: amp-workspace-id + type: string + - name: vpc-cfn-url + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/amazon-eks-vpc.json" + type: string + - name: ng-cfn-url + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_group_launch_template.json" + type: string + - name: kubernetes-version + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_service_role.json + name: service-role-cfn-url + type: string + - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_role.json + name: node-role-cfn-url + type: string + - name: cl2-load-test-throughput + type: string + - name: cl2-neuron-pods + description: "Number of Neuron pods to create" + type: string + - name: neuron-test-config-url + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml" + - name: neuron-test-pod-spec-url + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" + tasks: + - name: slack-notification + params: + - name: slack-hook + value: $(params.slack-hook) + - name: slack-message + value: $(params.slack-message) job kicked off + taskRef: + kind: Task + name: slack-notification + - name: create-cluster-service-role + 
params: + - name: stack-name + value: $(params.cluster-name)-service-role + - name: role-cfn-url + value: $(params.service-role-cfn-url) + - name: role-name + value: $(params.cluster-name)-service-role + runAfter: + - slack-notification + taskRef: + kind: Task + name: awscli-role-create + - name: awscli-vpc-create + params: + - name: stack-name + value: $(params.cluster-name) + - name: vpc-cfn-url + value: $(params.vpc-cfn-url) + taskRef: + kind: Task + name: awscli-vpc-create + - name: create-cluster-node-role + params: + - name: stack-name + value: $(params.cluster-name)-node-role + - name: role-cfn-url + value: $(params.node-role-cfn-url) + - name: role-name + value: $(params.cluster-name)-node-role + runAfter: + - slack-notification + taskRef: + kind: Task + name: awscli-role-create + - name: create-eks-cluster + params: + - name: cluster-name + value: $(params.cluster-name) + - name: service-role-name + value: $(params.cluster-name)-service-role + - name: endpoint + value: $(params.endpoint) + - name: vpc-stack-name + value: $(params.cluster-name) + - name: kubernetes-version + value: $(params.kubernetes-version) + retries: 3 + runAfter: + - create-cluster-node-role + - create-cluster-service-role + - awscli-vpc-create + taskRef: + kind: Task + name: awscli-eks-cluster-create-with-vpc-stack + workspaces: + - name: config + workspace: config + - name: create-launch-template + params: + - name: cluster-name + value: $(params.cluster-name) + - name: stack-name + value: $(params.cluster-name)-launch-template + - name: kubernetes-version + value: $(params.kubernetes-version) + - name: ng-cfn-url + value: $(params.ng-cfn-url) + - name: endpoint + value: $(params.endpoint) + runAfter: + - create-eks-cluster + taskRef: + kind: Task + name: awscli-eks-cfn-launch-template + workspaces: + - name: config + workspace: config + - name: create-mng-monitoring-nodes + params: + - name: cluster-name + value: $(params.cluster-name) + - name: host-cluster-node-role-name + value: 
$(params.cluster-name)-node-role + - name: endpoint + value: $(params.endpoint) + - name: desired-nodes + value: "1" + - name: max-nodes + value: "1" + - name: host-instance-types + value: "m5.12xlarge m5.16xlarge r5.12xlarge r5.16xlarge c5.12xlarge c5.18xlarge" + - name: host-taints + value: key=monitoring,value=true,effect=NO_SCHEDULE + - name: nodegroup-prefix + value: monitoring- + runAfter: + - create-launch-template + taskRef: + kind: Task + name: awscli-eks-nodegroup-create + workspaces: + - name: config + workspace: config + - name: create-mng-nodes + params: + - name: cluster-name + value: $(params.cluster-name) + - name: desired-nodes + value: $(params.desired-nodes) + - name: host-cluster-node-role-name + value: $(params.cluster-name)-node-role + - name: endpoint + value: $(params.endpoint) + runAfter: + - create-mng-monitoring-nodes + taskRef: + kind: Task + name: awscli-eks-nodegroup-create + workspaces: + - name: config + workspace: config + - name: install-neuron-device-plugin + params: + - name: cluster-name + value: $(params.cluster-name) + - name: instance-types + value: $(params.instance-types) + runAfter: [create-mng-nodes] + taskRef: + kind: Task + name: install-neuron-device-plugin + workspaces: + - name: config + workspace: config + - name: neuron-load + params: + - name: cluster-name + value: $(params.cluster-name) + - name: results-bucket + value: $(params.results-bucket) + - name: nodes + value: $(params.desired-nodes) + - name: cl2-neuron-pods + value: $(params.cl2-neuron-pods) + - name: amp-workspace-id + value: $(params.amp-workspace-id) + runAfter: [install-neuron-device-plugin] + taskRef: + kind: Task + name: load-neuron + workspaces: + - name: source + workspace: source + - name: results + workspace: results + - name: config + workspace: config + workspaces: + - name: source + - name: results + - name: config \ No newline at end of file diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml 
b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml new file mode 100644 index 00000000..96f654ae --- /dev/null +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml @@ -0,0 +1,43 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: load-neuron + namespace: scalability +spec: + description: "Run Neuron load test using clusterloader2" + params: + - name: cl2-neuron-pods + description: "Number of pods for Neuron test" + default: "100" + - name: cl2-uniform-qps + default: "500" + - name: results-bucket + - name: cluster-name + - name: amp-workspace-id + workspaces: + - name: source + - name: results + - name: config + steps: + - name: prepare-test + image: golang:1.22 + script: | + # Create test configuration + mkdir -p $(workspaces.source.path)/testing/neuron + cat > $(workspaces.source.path)/testing/neuron/config.yaml < $(workspaces.source.path)/testing/neuron/pod.yaml < /dev/null 2>&1; then + echo "Creating neuron namespace..." + kubectl create namespace neuron + else + echo "neuron namespace already exists, skipping creation..." + fi + kubectl get ns + - name: install-helm + image: alpine/k8s:1.23.7 + script: | + # Install required dependencies + apk add --no-cache openssl curl bash + + # Install Helm + curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash + - name: install-neuron + image: alpine/k8s:1.23.7 + script: | + # Update kubeconfig with token + ENDPOINT_FLAG="" + if [ -n "$(params.endpoint)" ]; then + ENDPOINT_FLAG="--endpoint $(params.endpoint)" + fi + aws eks $ENDPOINT_FLAG update-kubeconfig --name $(params.cluster-name) --region $(params.region) + + # TOKEN=$(aws eks $ENDPOINT_FLAG get-token --cluster-name $(params.cluster-name) --region $(params.region) --query 'status.token' --output text) + # echo $TOKEN + # kubectl config set-credentials aws --token="${TOKEN}" + + echo "Verifying cluster access..." 
+ kubectl get nodes + + # Create values file for Helm + cat << EOF > /tmp/values.yaml + devicePlugin: + env: + - name: KUBECONFIG + value: /etc/kubernetes/kubelet.conf + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NEURON_DEVICE_PLUGIN_EMULATION_MODE + value: "64" + scheduler: + enabled: true + env: + - name: PORT + value: "12345" + - name: NEURON_SCHEDULER_EMULATION_MODE + value: "64" + neuronInstances: + - m6i.4xlarge + npd: + enabled: false + EOF + + # Install Neuron using values file + helm upgrade --install \ + neuron \ + oci://public.ecr.aws/neuron/neuron-helm-chart \ + --namespace neuron \ + -f /tmp/values.yaml + + echo "Verifying Installation..." + kubectl get all -A -o wide | grep neuron \ No newline at end of file From 92d1d254d40f2a8a4f0c7dbe99c68528d05ddd30 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Wed, 26 Mar 2025 04:02:07 +0000 Subject: [PATCH 02/14] updated installation tasks and load-test tasks --- .../load-neuron-device-plugin.yaml | 197 ++++++++++++++++++ .../generators/clusterloader/load-neuron.yaml | 43 ---- .../tasks/setup/eks/awscli-neuron.yaml | 58 ++++-- 3 files changed, 235 insertions(+), 63 deletions(-) create mode 100644 tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml delete mode 100644 tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml new file mode 100644 index 00000000..690cbd34 --- /dev/null +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml @@ -0,0 +1,197 @@ +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: load-neuron-device-plugin + namespace: scalability +spec: + description: "Run Neuron device plugin load test using clusterloader2" + params: + - name: giturl + description: "git url to clone the 
package" + default: https://github.com/kubernetes/perf-tests.git + - name: cl2-branch + description: "The branch of clusterloader2 you want to use" + default: "master" + - name: cl2-neuron-pods + description: "Number of pods to create during test. Defaults to number of nodes if not specified." + default: "" + - name: cl2-uniform-qps + description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS." + default: "" + - name: cl2-neuron-resources-per-pod + description: "Neuron device units requested per pod. Defaults to 64 units." + default: "" + - name: neuron-config-url + description: "URL for the Neuron test configuration file for loadtest" + default: "" + - name: neuron-pod-url + description: "URL for the Neuron pod specification file for loadtest" + default: "" + - name: nodes + description: "number of dataplane nodes to run the load test against" + default: "12" + - name: results-bucket + description: "S3 bucket for results" + - name: cluster-name + description: "The name of the EKS cluster" + - name: region + default: "us-west-2" + - name: amp-workspace-id + default: "" + results: + - name: datapoint + description: Stores the CL2 result that can be consumed by other tasks + - name: s3_result + description: Stores the S3 result path after compute + - name: pod_startup_p50 + description: 50th percentile pod startup latency + - name: pod_startup_p90 + description: 90th percentile pod startup latency + - name: pod_startup_p99 + description: 99th percentile pod startup latency + workspaces: + - name: source + mountPath: /src/k8s.io/ + - name: results + - name: config + mountPath: /config/ + stepTemplate: + env: + - name: KUBECONFIG + value: /config/kubeconfig + steps: + - name: git-clone + image: alpine/git + workingDir: $(workspaces.source.path) + script: | + # Remove existing directory if it exists + rm -rf perf-tests + + # Clone fresh copy + git clone $(params.giturl) + cd $(workspaces.source.path)/perf-tests/ + git fetch origin 
--verbose --tags + git checkout $(params.cl2-branch) + git branch + + - name: prepare-test + image: golang:1.24 + workingDir: $(workspaces.source.path) + script: | + S3_RESULT_PATH=$(params.results-bucket) + echo $S3_RESULT_PATH > $(results.s3_result.path) + echo "S3 Path: $S3_RESULT_PATH" + + echo "# Override configurations" > "$(workspaces.source.path)/overrides.yaml" + if [ -n "$(params.cl2-neuron-pods)" ]; then + echo "CL2_NEURON_PODS: $(params.cl2-neuron-pods)" >> "$(workspaces.source.path)/overrides.yaml" + fi + + if [ -n "$(params.cl2-uniform-qps)" ]; then + echo "CL2_UNIFORM_QPS: $(params.cl2-uniform-qps)" >> "$(workspaces.source.path)/overrides.yaml" + fi + + if [ -n "$(params.cl2-neuron-resources-per-pod)" ]; then + echo "CL2_NEURON_RESOURCES_PER_POD: $(params.cl2-neuron-resources-per-pod)" >> "$(workspaces.source.path)/overrides.yaml" + fi + + echo "Generated overrides.yaml:" + cat $(workspaces.source.path)/overrides.yaml + cp $(workspaces.source.path)/overrides.yaml $(workspaces.results.path)/overrides.yaml + + # Create test directory + mkdir -p $(workspaces.source.path)/perf-tests/clusterloader2/testing/neuron + + # Download test configurations + curl -s $(params.neuron-config-url) \ + -o $(workspaces.source.path)/perf-tests/clusterloader2/testing/neuron/config.yaml + curl -s $(params.neuron-pod-url) \ + -o $(workspaces.source.path)/perf-tests/clusterloader2/testing/neuron/pod.yaml + + # Building clusterloader2 binary + cd $(workspaces.source.path)/perf-tests/clusterloader2/ + GOOS=linux CGO_ENABLED=0 go build -v -o ./clusterloader ./cmd + + - name: run-test + image: alpine/k8s:1.30.2 + script: | + echo "Starting run-test step" + + cd $(workspaces.source.path)/perf-tests/clusterloader2/ + + echo "Checking for clusterloader binary" + if [ ! 
-f "./clusterloader" ]; then + echo "Error: clusterloader binary not found in $(pwd)" + echo "Listing workspace root:" + ls -la $(workspaces.source.path) + echo "Listing perf-tests directory:" + ls -la $(workspaces.source.path)/perf-tests + exit 1 + fi + + chmod +x ./clusterloader + + echo "Verifying test configuration files" + echo "Content of testing/neuron/config.yaml:" + cat testing/neuron/config.yaml + echo "Content of testing/neuron/pod.yaml:" + cat testing/neuron/pod.yaml + + echo "Starting clusterloader test" + ENABLE_EXEC_SERVICE=false ./clusterloader \ + --testconfig=testing/neuron/config.yaml \ + --testoverrides=$(workspaces.source.path)/overrides.yaml \ + --nodes=$(params.nodes) \ + --provider=eks \ + --kubeconfig=${KUBECONFIG} \ + --report-dir=$(workspaces.results.path) \ + --alsologtostderr \ + --v=2 + + exit_code=$? + echo "Test completed with exit code: $exit_code" + + if [ $exit_code -eq 0 ]; then + echo "Test succeeded" + echo "1" | tee $(results.datapoint.path) + else + echo "Test failed" + echo "0" | tee $(results.datapoint.path) + fi + + exit $exit_code + timeout: 30000s + + - name: process-metrics + image: alpine + workingDir: $(workspaces.results.path) + script: | + apk add --no-cache jq + + # find the pod startup metrics in the JSON + POD_STARTUP_METRICS=$(jq '.dataItems[] | select(.labels.Metric == "pod_startup") | .data' PodStartupLatency_*.json) + + if [ -z "$POD_STARTUP_METRICS" ]; then + echo "Error: Could not find pod_startup metrics" + exit 1 + fi + + # get the pod startup p50, p90 and p99 + echo "$POD_STARTUP_METRICS" | jq -r '.Perc50' > $(results.pod_startup_p50.path) + echo "$POD_STARTUP_METRICS" | jq -r '.Perc90' > $(results.pod_startup_p90.path) + echo "$POD_STARTUP_METRICS" | jq -r '.Perc99' > $(results.pod_startup_p99.path) + + echo "Extracted metrics:" + echo "P50: $(cat $(results.pod_startup_p50.path))" + echo "P90: $(cat $(results.pod_startup_p90.path))" + echo "P99: $(cat $(results.pod_startup_p99.path))" + + - name: 
upload-results + image: amazon/aws-cli + workingDir: $(workspaces.results.path) + script: | + S3_RESULT_PATH=$(cat $(results.s3_result.path)) + echo "S3 Path: $S3_RESULT_PATH" + aws sts get-caller-identity + ls -larth + aws s3 cp . s3://$S3_RESULT_PATH/ --recursive \ No newline at end of file diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml deleted file mode 100644 index 96f654ae..00000000 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: tekton.dev/v1beta1 -kind: Task -metadata: - name: load-neuron - namespace: scalability -spec: - description: "Run Neuron load test using clusterloader2" - params: - - name: cl2-neuron-pods - description: "Number of pods for Neuron test" - default: "100" - - name: cl2-uniform-qps - default: "500" - - name: results-bucket - - name: cluster-name - - name: amp-workspace-id - workspaces: - - name: source - - name: results - - name: config - steps: - - name: prepare-test - image: golang:1.22 - script: | - # Create test configuration - mkdir -p $(workspaces.source.path)/testing/neuron - cat > $(workspaces.source.path)/testing/neuron/config.yaml < $(workspaces.source.path)/testing/neuron/pod.yaml < /dev/null 2>&1; then - echo "Creating neuron namespace..." - kubectl create namespace neuron - else - echo "neuron namespace already exists, skipping creation..." 
- fi - kubectl get ns + + aws --version + aws sts get-caller-identity + + echo "Available nodes in the cluster:" + # List nodegroups in the cluster + aws eks $ENDPOINT_FLAG list-nodegroups --cluster-name $(params.cluster-name) --region $(params.region) + - name: install-helm image: alpine/k8s:1.23.7 script: | @@ -53,13 +58,18 @@ spec: ENDPOINT_FLAG="--endpoint $(params.endpoint)" fi aws eks $ENDPOINT_FLAG update-kubeconfig --name $(params.cluster-name) --region $(params.region) - - # TOKEN=$(aws eks $ENDPOINT_FLAG get-token --cluster-name $(params.cluster-name) --region $(params.region) --query 'status.token' --output text) - # echo $TOKEN - # kubectl config set-credentials aws --token="${TOKEN}" - echo "Verifying cluster access..." + echo "Verifying access to the cluster..." kubectl get nodes + + # Determine instance types + if [ -n "$(params.instance-types)" ]; then + INSTANCE_TYPES="$(params.instance-types)" + echo "Will install Neuron device plugin in emulation mode for specified instance types: $INSTANCE_TYPES" + else + INSTANCE_TYPES=$(kubectl get nodes -o jsonpath='{.items[*].metadata.labels.node\.kubernetes\.io/instance-type}' | tr ' ' '\n' | sort -u | tr '\n' ',') + echo "Auto-detected instance types from cluster: $INSTANCE_TYPES" + fi # Create values file for Helm cat << EOF > /tmp/values.yaml @@ -73,6 +83,12 @@ spec: fieldPath: spec.nodeName - name: NEURON_DEVICE_PLUGIN_EMULATION_MODE value: "64" + nodeSelector: null + tolerations: + - operator: Exists + effect: NoSchedule + updateStrategy: + type: RollingUpdate scheduler: enabled: true env: @@ -80,18 +96,20 @@ spec: value: "12345" - name: NEURON_SCHEDULER_EMULATION_MODE value: "64" - neuronInstances: - - m6i.4xlarge + neuronInstances: [${INSTANCE_TYPES}] npd: enabled: false EOF + echo "Using this values.yaml:" + cat /tmp/values.yaml + # Install Neuron using values file helm upgrade --install \ neuron \ oci://public.ecr.aws/neuron/neuron-helm-chart \ - --namespace neuron \ + --namespace kube-system \ -f 
/tmp/values.yaml echo "Verifying Installation..." - kubectl get all -A -o wide | grep neuron \ No newline at end of file + kubectl get daemonsets -n kube-system | grep neuron \ No newline at end of file From f85a86a1104e7af42755c1ce594ec96d110bab33 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Wed, 26 Mar 2025 04:34:33 +0000 Subject: [PATCH 03/14] integrated neuron installation plugins with the main pipeline --- .../eks/awscli-cl2-load-with-addons-slos.yaml | 96 +++++++ .../eks/awscli-cl2-load-with-neuron.yaml | 245 ------------------ 2 files changed, 96 insertions(+), 245 deletions(-) delete mode 100644 tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index 9a2d25e2..5bf88018 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -88,6 +88,15 @@ spec: default: "20m" - name: timeout-pia-pod-startup default: "5m" + - name: neuron-test-config-url + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml" + - name: neuron-test-pod-spec-url + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" + - name: cl2-load-test-throughput + default: string + - name: cl2-neuron-pods + description: "Number of Neuron pods to create" + type: string tasks: - name: slack-notification params: @@ -220,6 +229,19 @@ spec: workspaces: - name: config workspace: config + - name: install-neuron-device-plugin + params: + - name: cluster-name + value: $(params.cluster-name) + - name: instance-types + value: $(params.instance-types) + runAfter: create-mng-nodes + taskRef: + kind: Task + name: install-neuron-device-plugin + workspaces: + - name: config + workspace: config 
- name: create-pod-identity-association params: - name: cluster-name @@ -310,6 +332,29 @@ spec: workspace: results - name: config workspace: config + - name: generate-neuron-load + params: + - name: cluster-name + value: $(params.cluster-name) + - name: results-bucket + value: $(params.results-bucket) + - name: nodes + value: $(params.desired-nodes) + - name: cl2-neuron-pods + value: $(params.cl2-neuron-pods) + - name: amp-workspace-id + value: $(params.amp-workspace-id) + runAfter: install-neuron-device-plugin + taskRef: + kind: Task + name: load-neuron-device-plugin + workspaces: + - name: source + workspace: source + - name: results + workspace: results + - name: config + workspace: config - name: cw-metrics-eks-pod-identity params: - name: dimensions @@ -336,6 +381,57 @@ spec: taskRef: kind: Task name: cloudwatch + - name: cw-metrics-neuron-device-plugin-latency-p50 + params: + - name: dimensions + value: $(params.desired-nodes) + - name: value + value: $(tasks.generate-neuron-load.results.pod_startup_p50) + - name: namespace + value: neuron-device-plugin-$(params.kubernetes-version) + - name: metric-name + value: pod_startup_latency_p50 + - name: unit + value: Milliseconds + runAfter: + - generate-neuron-load + taskRef: + kind: Task + name: cloudwatch + - name: cw-metrics-neuron-device-plugin-latency-p90 + params: + - name: dimensions + value: $(params.desired-nodes) + - name: value + value: $(tasks.generate-neuron-load.results.pod_startup_p90) + - name: namespace + value: neuron-device-plugin-$(params.kubernetes-version) + - name: metric-name + value: pod_startup_latency_p90 + - name: unit + value: Milliseconds + runAfter: + - generate-neuron-load + taskRef: + kind: Task + name: cloudwatch + - name: cw-metrics-neuron-device-plugin-latency-p99 + params: + - name: dimensions + value: $(params.desired-nodes) + - name: value + value: $(tasks.generate-neuron-load.results.pod_startup_p99) + - name: namespace + value: 
neuron-device-plugin-$(params.kubernetes-version) + - name: metric-name + value: pod_startup_latency_p99 + - name: unit + value: Milliseconds + runAfter: + - generate-neuron-load + taskRef: + kind: Task + name: cloudwatch workspaces: - name: source - name: results diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml deleted file mode 100644 index 1834f1b2..00000000 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-neuron.yaml +++ /dev/null @@ -1,245 +0,0 @@ -apiVersion: tekton.dev/v1 -kind: Pipeline -metadata: - name: awscli-eks-neuron-load-test - namespace: scalability -spec: - finally: - - name: teardown - params: - - name: cluster-name - value: $(params.cluster-name) - - name: endpoint - value: $(params.endpoint) - - name: slack-hook - value: $(params.slack-hook) - - name: slack-message - value: $(params.slack-message) job completed - - name: service-role-stack-name - value: $(params.cluster-name)-service-role - - name: node-role-stack-name - value: $(params.cluster-name)-node-role - - name: launch-template-stack-name - value: $(params.cluster-name)-launch-template - retries: 2 - taskRef: - kind: Task - name: awscli-eks-cluster-teardown - params: - - name: cluster-name - type: string - - name: endpoint - type: string - - name: desired-nodes - type: string - - name: pods-per-node - type: string - - name: nodes-per-namespace - type: string - - name: instance-types - description: "Instance types for Neuron emulation" - type: string - - name: results-bucket - type: string - - default: "" - name: slack-hook - type: string - - name: slack-message - type: string - - name: amp-workspace-id - type: string - - name: vpc-cfn-url - default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/amazon-eks-vpc.json" - type: string - - name: ng-cfn-url - default: 
"https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_group_launch_template.json" - type: string - - name: kubernetes-version - type: string - - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_service_role.json - name: service-role-cfn-url - type: string - - default: https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/eks_node_role.json - name: node-role-cfn-url - type: string - - name: cl2-load-test-throughput - type: string - - name: cl2-neuron-pods - description: "Number of Neuron pods to create" - type: string - - name: neuron-test-config-url - default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml" - - name: neuron-test-pod-spec-url - default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" - tasks: - - name: slack-notification - params: - - name: slack-hook - value: $(params.slack-hook) - - name: slack-message - value: $(params.slack-message) job kicked off - taskRef: - kind: Task - name: slack-notification - - name: create-cluster-service-role - params: - - name: stack-name - value: $(params.cluster-name)-service-role - - name: role-cfn-url - value: $(params.service-role-cfn-url) - - name: role-name - value: $(params.cluster-name)-service-role - runAfter: - - slack-notification - taskRef: - kind: Task - name: awscli-role-create - - name: awscli-vpc-create - params: - - name: stack-name - value: $(params.cluster-name) - - name: vpc-cfn-url - value: $(params.vpc-cfn-url) - taskRef: - kind: Task - name: awscli-vpc-create - - name: create-cluster-node-role - params: - - name: stack-name - value: $(params.cluster-name)-node-role - - name: role-cfn-url - value: $(params.node-role-cfn-url) - - name: role-name - value: $(params.cluster-name)-node-role - runAfter: - - slack-notification - taskRef: - kind: Task - 
name: awscli-role-create - - name: create-eks-cluster - params: - - name: cluster-name - value: $(params.cluster-name) - - name: service-role-name - value: $(params.cluster-name)-service-role - - name: endpoint - value: $(params.endpoint) - - name: vpc-stack-name - value: $(params.cluster-name) - - name: kubernetes-version - value: $(params.kubernetes-version) - retries: 3 - runAfter: - - create-cluster-node-role - - create-cluster-service-role - - awscli-vpc-create - taskRef: - kind: Task - name: awscli-eks-cluster-create-with-vpc-stack - workspaces: - - name: config - workspace: config - - name: create-launch-template - params: - - name: cluster-name - value: $(params.cluster-name) - - name: stack-name - value: $(params.cluster-name)-launch-template - - name: kubernetes-version - value: $(params.kubernetes-version) - - name: ng-cfn-url - value: $(params.ng-cfn-url) - - name: endpoint - value: $(params.endpoint) - runAfter: - - create-eks-cluster - taskRef: - kind: Task - name: awscli-eks-cfn-launch-template - workspaces: - - name: config - workspace: config - - name: create-mng-monitoring-nodes - params: - - name: cluster-name - value: $(params.cluster-name) - - name: host-cluster-node-role-name - value: $(params.cluster-name)-node-role - - name: endpoint - value: $(params.endpoint) - - name: desired-nodes - value: "1" - - name: max-nodes - value: "1" - - name: host-instance-types - value: "m5.12xlarge m5.16xlarge r5.12xlarge r5.16xlarge c5.12xlarge c5.18xlarge" - - name: host-taints - value: key=monitoring,value=true,effect=NO_SCHEDULE - - name: nodegroup-prefix - value: monitoring- - runAfter: - - create-launch-template - taskRef: - kind: Task - name: awscli-eks-nodegroup-create - workspaces: - - name: config - workspace: config - - name: create-mng-nodes - params: - - name: cluster-name - value: $(params.cluster-name) - - name: desired-nodes - value: $(params.desired-nodes) - - name: host-cluster-node-role-name - value: $(params.cluster-name)-node-role - - 
name: endpoint - value: $(params.endpoint) - runAfter: - - create-mng-monitoring-nodes - taskRef: - kind: Task - name: awscli-eks-nodegroup-create - workspaces: - - name: config - workspace: config - - name: install-neuron-device-plugin - params: - - name: cluster-name - value: $(params.cluster-name) - - name: instance-types - value: $(params.instance-types) - runAfter: [create-mng-nodes] - taskRef: - kind: Task - name: install-neuron-device-plugin - workspaces: - - name: config - workspace: config - - name: neuron-load - params: - - name: cluster-name - value: $(params.cluster-name) - - name: results-bucket - value: $(params.results-bucket) - - name: nodes - value: $(params.desired-nodes) - - name: cl2-neuron-pods - value: $(params.cl2-neuron-pods) - - name: amp-workspace-id - value: $(params.amp-workspace-id) - runAfter: [install-neuron-device-plugin] - taskRef: - kind: Task - name: load-neuron - workspaces: - - name: source - workspace: source - - name: results - workspace: results - - name: config - workspace: config - workspaces: - - name: source - - name: results - - name: config \ No newline at end of file From 65bcba2cf0bfd833fa2928e413bc5f14d8b8e01d Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Wed, 26 Mar 2025 04:51:02 +0000 Subject: [PATCH 04/14] exposed more params in the pipeline --- .../eks/awscli-cl2-load-with-addons-slos.yaml | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index 5bf88018..b8304a55 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -92,11 +92,17 @@ spec: default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml" - name: neuron-test-pod-spec-url default: 
"https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" - - name: cl2-load-test-throughput - default: string + - name: neuron-test-instance-types + description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster." + default: "" - name: cl2-neuron-pods - description: "Number of Neuron pods to create" - type: string + description: "Number of pods to create during test. Defaults to number of nodes if not specified." + - name: cl2-neuron-uniform-qps + description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS." + default: "500" + - name: cl2-neuron-resources-per-pod + description: "Neuron device units requested per pod. Defaults to 64 units." + default: "64" tasks: - name: slack-notification params: @@ -234,7 +240,7 @@ spec: - name: cluster-name value: $(params.cluster-name) - name: instance-types - value: $(params.instance-types) + value: $(params.neuron-test-instance-types) runAfter: create-mng-nodes taskRef: kind: Task @@ -342,6 +348,10 @@ spec: value: $(params.desired-nodes) - name: cl2-neuron-pods value: $(params.cl2-neuron-pods) + - name: cl2-uniform-qps + value: $(params.cl2-neuron-uniform-qps) + - name: cl2-neuron-resources-per-pod + value: $(params.cl2-neuron-resources-per-pod) - name: amp-workspace-id value: $(params.amp-workspace-id) runAfter: install-neuron-device-plugin From 55579360ff3e53384d7db3c14a13743b5b1e474e Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Fri, 11 Apr 2025 04:41:54 +0000 Subject: [PATCH 05/14] Set default cl2-neuron-pods value in pipeline definition. 
Emitted neuron load test result outcome --- .../eks/awscli-cl2-load-with-addons-slos.yaml | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index b8304a55..c1d5dd28 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -97,6 +97,7 @@ spec: default: "" - name: cl2-neuron-pods description: "Number of pods to create during test. Defaults to number of nodes if not specified." + default: "10" - name: cl2-neuron-uniform-qps description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS." default: "500" @@ -241,7 +242,8 @@ spec: value: $(params.cluster-name) - name: instance-types value: $(params.neuron-test-instance-types) - runAfter: create-mng-nodes + runAfter: + - create-mng-nodes taskRef: kind: Task name: install-neuron-device-plugin @@ -354,7 +356,8 @@ spec: value: $(params.cl2-neuron-resources-per-pod) - name: amp-workspace-id value: $(params.amp-workspace-id) - runAfter: install-neuron-device-plugin + runAfter: + - install-neuron-device-plugin taskRef: kind: Task name: load-neuron-device-plugin @@ -442,6 +445,21 @@ spec: taskRef: kind: Task name: cloudwatch + - name: cw-metrics-neuron-load-test-outcome + params: + - name: dimensions + value: $(params.desired-nodes) + - name: value + value: $(tasks.generate-neuron-load.results.datapoint) + - name: metric-name + value: outcome + - name: namespace + value: neuron-device-plugin-$(params.kubernetes-version) + runAfter: + - generate-neuron-load + taskRef: + kind: Task + name: cloudwatch workspaces: - name: source - name: results From 9c363dbf849305e0353f2e80160abfc2518b0eef Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Fri, 11 Apr 2025 07:34:56 +0000 Subject: [PATCH 06/14] reordered 
tasks --- .../eks/awscli-cl2-load-with-addons-slos.yaml | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index c1d5dd28..1b134c4f 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -96,8 +96,8 @@ spec: description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster." default: "" - name: cl2-neuron-pods - description: "Number of pods to create during test. Defaults to number of nodes if not specified." default: "10" + description: "Number of pods to create during test. Defaults to number of nodes if not specified." - name: cl2-neuron-uniform-qps description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS." 
default: "500" @@ -250,6 +250,34 @@ spec: workspaces: - name: config workspace: config + - name: generate-neuron-load + params: + - name: cluster-name + value: $(params.cluster-name) + - name: results-bucket + value: $(params.results-bucket) + - name: nodes + value: $(params.desired-nodes) + - name: cl2-neuron-pods + value: $(params.cl2-neuron-pods) + - name: cl2-uniform-qps + value: $(params.cl2-neuron-uniform-qps) + - name: cl2-neuron-resources-per-pod + value: $(params.cl2-neuron-resources-per-pod) + - name: amp-workspace-id + value: $(params.amp-workspace-id) + runAfter: + - install-neuron-device-plugin + taskRef: + kind: Task + name: load-neuron-device-plugin + workspaces: + - name: source + workspace: source + - name: results + workspace: results + - name: config + workspace: config - name: create-pod-identity-association params: - name: cluster-name @@ -263,7 +291,7 @@ spec: - name: pia-trust-policy-url value: $(params.pia-trust-policy-url) runAfter: - - create-mng-nodes + - generate-neuron-load taskRef: kind: Task name: awscli-eks-pia-create @@ -340,34 +368,6 @@ spec: workspace: results - name: config workspace: config - - name: generate-neuron-load - params: - - name: cluster-name - value: $(params.cluster-name) - - name: results-bucket - value: $(params.results-bucket) - - name: nodes - value: $(params.desired-nodes) - - name: cl2-neuron-pods - value: $(params.cl2-neuron-pods) - - name: cl2-uniform-qps - value: $(params.cl2-neuron-uniform-qps) - - name: cl2-neuron-resources-per-pod - value: $(params.cl2-neuron-resources-per-pod) - - name: amp-workspace-id - value: $(params.amp-workspace-id) - runAfter: - - install-neuron-device-plugin - taskRef: - kind: Task - name: load-neuron-device-plugin - workspaces: - - name: source - workspace: source - - name: results - workspace: results - - name: config - workspace: config - name: cw-metrics-eks-pod-identity params: - name: dimensions From c22ef1a655ad8619474e7a28a8402cc80c89aaba Mon Sep 17 00:00:00 2001 From: 
Shiv Bhosale Date: Fri, 11 Apr 2025 08:25:26 +0000 Subject: [PATCH 07/14] increased threshold for pod startup --- tests/assets/neuron/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/assets/neuron/config.yaml b/tests/assets/neuron/config.yaml index db43b0a8..2fdc719d 100644 --- a/tests/assets/neuron/config.yaml +++ b/tests/assets/neuron/config.yaml @@ -18,7 +18,7 @@ steps: Params: action: start labelSelector: group = neuron-worker - threshold: 20s + threshold: 60s - name: Create pods phases: - namespaceRange: From 697fcd8f5e2c650a5e0fd0123994f40ce347cce1 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Mon, 14 Apr 2025 06:53:47 +0000 Subject: [PATCH 08/14] changed defaults for better usability --- .../pipelines/eks/awscli-cl2-load-with-addons-slos.yaml | 8 ++++++-- .../clusterloader/load-neuron-device-plugin.yaml | 2 ++ tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index 1b134c4f..0e7e9edd 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -96,10 +96,10 @@ spec: description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster." default: "" - name: cl2-neuron-pods - default: "10" + default: "" description: "Number of pods to create during test. Defaults to number of nodes if not specified." - name: cl2-neuron-uniform-qps - description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS." + description: "Rate at which pods are created/deleted. Defaults to 500 QPS." default: "500" - name: cl2-neuron-resources-per-pod description: "Neuron device units requested per pod. Defaults to 64 units."
@@ -264,6 +264,10 @@ spec: value: $(params.cl2-neuron-uniform-qps) - name: cl2-neuron-resources-per-pod value: $(params.cl2-neuron-resources-per-pod) + - name: neuron-config-url + value: $(params.neuron-test-config-url) + - name: neuron-pod-url + value: $(params.neuron-test-pod-spec-url) - name: amp-workspace-id value: $(params.amp-workspace-id) runAfter: diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml index 690cbd34..48b60f6e 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml @@ -81,6 +81,8 @@ spec: S3_RESULT_PATH=$(params.results-bucket) echo $S3_RESULT_PATH > $(results.s3_result.path) echo "S3 Path: $S3_RESULT_PATH" + echo "$(params.neuron-config-url)" + echo "$(params.neuron-pod-url)" echo "# Override configurations" > "$(workspaces.source.path)/overrides.yaml" if [ -n "$(params.cl2-neuron-pods)" ]; then diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml index f6c3605e..6bc04e4f 100644 --- a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml +++ b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml @@ -10,7 +10,7 @@ spec: - name: cluster-name description: The name of the EKS cluster. - name: region - default: "" + default: us-west-2 description: The region where the cluster is in. - name: endpoint default: "" @@ -111,5 +111,6 @@ spec: --namespace kube-system \ -f /tmp/values.yaml + sleep 5 echo "Verifying Installation..." 
kubectl get daemonsets -n kube-system | grep neuron \ No newline at end of file From 0affed331ac4dc4449a27b9033465a90e95db5bd Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Fri, 18 Apr 2025 22:38:07 +0000 Subject: [PATCH 09/14] moved default values from task to pipeline --- .../pipelines/eks/awscli-cl2-load-with-addons-slos.yaml | 2 ++ tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index fffd3e91..7359950b 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -283,6 +283,8 @@ spec: value: $(params.cluster-name) - name: instance-types value: $(params.neuron-test-instance-types) + - name: endpoint + value: $(params.endpoint) runAfter: - create-mng-nodes taskRef: diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml index 6bc04e4f..ede3f70a 100644 --- a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml +++ b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml @@ -13,10 +13,8 @@ spec: default: us-west-2 description: The region where the cluster is in. - name: endpoint - default: "" - name: instance-types description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster." - default: "" workspaces: - name: config mountPath: /config/ From a661b0cd4f700ebdcb8ba882670a03b75a05d282 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Tue, 22 Apr 2025 00:08:56 +0000 Subject: [PATCH 10/14] Added a verification to ensure that all the neuron-device-plugin daemonsets are ready. This ensures that the load-tests don't start prematurely and inflate the pod startup latency numbers. 
Removed neuron-scheduler since it is not being used --- .../tasks/setup/eks/awscli-neuron.yaml | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml index ede3f70a..a6e50e19 100644 --- a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml +++ b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml @@ -87,13 +87,6 @@ spec: effect: NoSchedule updateStrategy: type: RollingUpdate - scheduler: - enabled: true - env: - - name: PORT - value: "12345" - - name: NEURON_SCHEDULER_EMULATION_MODE - value: "64" neuronInstances: [${INSTANCE_TYPES}] npd: enabled: false @@ -111,4 +104,17 @@ spec: sleep 5 echo "Verifying Installation..." - kubectl get daemonsets -n kube-system | grep neuron \ No newline at end of file + while true; do + DESIRED=$(kubectl get ds neuron-device-plugin -n kube-system -o jsonpath='{.status.desiredNumberScheduled}') + READY=$(kubectl get ds neuron-device-plugin -n kube-system -o jsonpath='{.status.numberReady}') + + echo "Desired: $DESIRED, Ready: $READY" + + if [ "$DESIRED" == "$READY" ] && [ "$DESIRED" -gt 0 ]; then + echo "Neuron device plugin installation verified successfully" + break + else + echo "Waiting for neuron-device-plugin daemonset to be ready..." 
+ sleep 5 + fi + done \ No newline at end of file From 1c040a12869fa5735225399f0c44742e220f44c4 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Tue, 22 Apr 2025 14:27:27 +0000 Subject: [PATCH 11/14] moved defaults to task definition and improved pipeline defaults --- .../eks/awscli-cl2-load-with-addons-slos.yaml | 10 +--------- .../clusterloader/load-neuron-device-plugin.yaml | 11 +++++------ .../tasks/setup/eks/awscli-neuron.yaml | 1 + 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index 7359950b..a0a6fc5d 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -92,12 +92,6 @@ spec: default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml" - name: neuron-test-pod-spec-url default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" - - name: neuron-test-instance-types - description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster." - default: "" - - name: cl2-neuron-pods - default: "" - description: "Number of pods to create during test. Defaults to number of nodes if not specified." - name: cl2-neuron-uniform-qps description: "Rate at which pods are created/deleted. Defaults to 500 QPS."
default: "500" @@ -281,8 +275,6 @@ spec: params: - name: cluster-name value: $(params.cluster-name) - - name: instance-types - value: $(params.neuron-test-instance-types) - name: endpoint value: $(params.endpoint) runAfter: @@ -302,7 +294,7 @@ spec: - name: nodes value: $(params.desired-nodes) - name: cl2-neuron-pods - value: $(params.cl2-neuron-pods) + value: $(params.desired-nodes) - name: cl2-uniform-qps value: $(params.cl2-neuron-uniform-qps) - name: cl2-neuron-resources-per-pod diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml index 48b60f6e..fd376b7c 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml @@ -13,20 +13,19 @@ spec: description: "The branch of clusterloader2 you want to use" default: "master" - name: cl2-neuron-pods - description: "Number of pods to create during test. Defaults to number of nodes if not specified." - default: "" + description: "Number of pods to create during test" - name: cl2-uniform-qps description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS." - default: "" + default: "500" - name: cl2-neuron-resources-per-pod description: "Neuron device units requested per pod. Defaults to 64 units." 
- default: "" + default: "64" - name: neuron-config-url description: "URL for the Neuron test configuration file for loadtest" - default: "" + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml" - name: neuron-pod-url description: "URL for the Neuron pod specification file for loadtest" - default: "" + default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" - name: nodes description: "number of dataplane nodes to run the load test against" default: "12" diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml index a6e50e19..bf74c201 100644 --- a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml +++ b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml @@ -14,6 +14,7 @@ spec: description: The region where the cluster is in. - name: endpoint - name: instance-types + default: "" description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster." 
workspaces: - name: config From 733fcd564fcbfd1051842b925dde494d2fd34b02 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Tue, 22 Apr 2025 16:42:52 +0000 Subject: [PATCH 12/14] removed unused param from generate-neuron-load --- .../pipelines/eks/awscli-cl2-load-with-addons-slos.yaml | 2 -- .../generators/clusterloader/load-neuron-device-plugin.yaml | 2 -- 2 files changed, 4 deletions(-) diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml index a0a6fc5d..cd361914 100644 --- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml +++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml @@ -303,8 +303,6 @@ spec: value: $(params.neuron-test-config-url) - name: neuron-pod-url value: $(params.neuron-test-pod-spec-url) - - name: amp-workspace-id - value: $(params.amp-workspace-id) runAfter: - install-neuron-device-plugin taskRef: diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml index fd376b7c..9e11740f 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml @@ -35,8 +35,6 @@ spec: description: "The name of the EKS cluster" - name: region default: "us-west-2" - - name: amp-workspace-id - default: "" results: - name: datapoint description: Stores the CL2 result that can be consumed by other tasks From f99d4f0553d75a8ba1c9a5fccfeec92babbd3809 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Tue, 22 Apr 2025 16:54:28 +0000 Subject: [PATCH 13/14] removed arbitrary default node value for generate-neuron-load task --- .../generators/clusterloader/load-neuron-device-plugin.yaml | 1 - 1 file changed, 1 deletion(-) diff --git 
a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml index 9e11740f..4da5ba68 100644 --- a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml +++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml @@ -28,7 +28,6 @@ spec: default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml" - name: nodes description: "number of dataplane nodes to run the load test against" - default: "12" - name: results-bucket description: "S3 bucket for results" - name: cluster-name From c830538f041e7ba3dd4739f2cfbfab511fe97809 Mon Sep 17 00:00:00 2001 From: Shiv Bhosale Date: Fri, 2 May 2025 20:48:07 +0000 Subject: [PATCH 14/14] reduced threshold to 25s --- tests/assets/neuron/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/assets/neuron/config.yaml b/tests/assets/neuron/config.yaml index 2fdc719d..11c437e6 100644 --- a/tests/assets/neuron/config.yaml +++ b/tests/assets/neuron/config.yaml @@ -18,7 +18,7 @@ steps: Params: action: start labelSelector: group = neuron-worker - threshold: 60s + threshold: 25s - name: Create pods phases: - namespaceRange: