From 2bb3e4443c6dae8331275c5a5efe7fd2c7e11152 Mon Sep 17 00:00:00 2001
From: Chithresh Azad
Date: Thu, 9 Oct 2025 11:27:19 -0700
Subject: [PATCH] Add ai-ml nodeclasses and nodepools

Description / Motivation:
Move the manually created nodeclasses and nodepools in the perflab-titan-1
cluster to KIT so they can be reused in future runs.

Related Asana Task:
https://app.asana.com/1/8442528107068/project/1209254984904634/task/1211563354393458?focus=true

Desktop Testing:
Tested by creating a pipeline run:
https://experimental.scalability.eks.aws.dev/#/namespaces/scalability/pipelineruns/chithres-titan-ai-ml-pipeline-run-v36

Once this commit is merged I will also raise a PR for the ai-ml-load
Pipeline. Currently the Pipeline references the nodepool and nodeclass URLs
from my KIT fork.
---
 .../ai-ml-inference-nodepool-xlarge.yaml      | 57 +++++++++++++
 .../karpenter/ai-ml-monitoring-24xlarge.yaml  | 64 ++++++++++++++
 .../karpenter/ai-ml-operator-12xlarge.yaml    | 52 ++++++++++++
 .../karpenter/ai-ml-training-large.yaml       | 84 +++++++++++++++++++
 .../eks-perflab-ai-training-nodeclass.yaml    | 71 ++++++++++++++++
 .../eks-perflab-titan-class-nodeclass.yaml    | 65 ++++++++++++++
 tests/assets/karpenter/titan-pool.yaml        | 78 +++++++++++++++++
 7 files changed, 471 insertions(+)
 create mode 100644 tests/assets/karpenter/ai-ml-inference-nodepool-xlarge.yaml
 create mode 100644 tests/assets/karpenter/ai-ml-monitoring-24xlarge.yaml
 create mode 100644 tests/assets/karpenter/ai-ml-operator-12xlarge.yaml
 create mode 100644 tests/assets/karpenter/ai-ml-training-large.yaml
 create mode 100644 tests/assets/karpenter/eks-perflab-ai-training-nodeclass.yaml
 create mode 100644 tests/assets/karpenter/eks-perflab-titan-class-nodeclass.yaml
 create mode 100644 tests/assets/karpenter/titan-pool.yaml

diff --git a/tests/assets/karpenter/ai-ml-inference-nodepool-xlarge.yaml b/tests/assets/karpenter/ai-ml-inference-nodepool-xlarge.yaml
new file mode 100644
index 00000000..a3572c54
--- /dev/null
+++ b/tests/assets/karpenter/ai-ml-inference-nodepool-xlarge.yaml
@@ -0,0 +1,57 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: ai-ml-inference-xlarge-${AZ}  # ${AZ} is substituted before apply
+spec:
+  disruption:
+    budgets:
+      - nodes: 10%
+    consolidateAfter: 0s
+    consolidationPolicy: WhenEmptyOrUnderutilized
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        purpose: ml-xlarge
+    spec:
+      expireAfter: 720h
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: ai-training
+      requirements:
+        - key: topology.kubernetes.io/zone
+          operator: In
+          values:
+            - ${AZ}
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: kubernetes.io/os
+          operator: In
+          values:
+            - linux
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values:
+            - on-demand
+        - key: karpenter.k8s.aws/instance-category  # same key as the monitoring pool
+          operator: In
+          values:
+            - m
+            - r
+        - key: karpenter.k8s.aws/instance-size
+          operator: In
+          values:
+            - xlarge
+        - key: karpenter.k8s.aws/instance-generation
+          operator: Gt
+          values:
+            - "6"
+        - key: node.kubernetes.io/instance-type  # explicit c-family exclusions
+          operator: NotIn
+          values:
+            - c7i-flex.xlarge
+            - c7i.xlarge
+            - c7a.xlarge
diff --git a/tests/assets/karpenter/ai-ml-monitoring-24xlarge.yaml b/tests/assets/karpenter/ai-ml-monitoring-24xlarge.yaml
new file mode 100644
index 00000000..bd1bd358
--- /dev/null
+++ b/tests/assets/karpenter/ai-ml-monitoring-24xlarge.yaml
@@ -0,0 +1,64 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: ai-ml-monitoring-24xlarge-${AZ}  # ${AZ} is substituted before apply
+spec:
+  disruption:
+    budgets:
+      - nodes: 100%
+        reasons:
+          - Empty
+      - nodes: 10%
+        reasons:
+          - Drifted
+          - Underutilized
+    consolidateAfter: 0s
+    consolidationPolicy: WhenEmpty
+  limits:
+    nodes: "26400"
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        purpose: ml-24xlarge
+    spec:
+      expireAfter: 720h0m0s
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: ai-training
+      requirements:
+        - key: topology.kubernetes.io/zone
+          operator: In
+          values:
+            - ${AZ}
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: kubernetes.io/os
+          operator: In
+          values:
+            - linux
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values:
+            - on-demand
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values:
+            - c
+            - m
+            - r
+        - key: karpenter.k8s.aws/instance-size
+          operator: In
+          values:
+            - 24xlarge
+        - key: karpenter.k8s.aws/instance-generation
+          operator: Gt
+          values:
+            - "4"
+      taints:
+        - effect: NoSchedule  # pods need a matching "monitoring" toleration
+          key: monitoring
+          value: "true"
diff --git a/tests/assets/karpenter/ai-ml-operator-12xlarge.yaml b/tests/assets/karpenter/ai-ml-operator-12xlarge.yaml
new file mode 100644
index 00000000..df635446
--- /dev/null
+++ b/tests/assets/karpenter/ai-ml-operator-12xlarge.yaml
@@ -0,0 +1,52 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: ai-ml-operator-12xlarge-${AZ}  # ${AZ} is substituted before apply
+spec:
+  disruption:
+    budgets:
+      - nodes: 0%
+    consolidateAfter: 0s
+    consolidationPolicy: WhenEmpty
+  replicas: 0
+  template:
+    metadata:
+      labels:
+        purpose: ml-12xlarge
+    spec:
+      expireAfter: 720h
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: ai-training
+      requirements:
+        - key: topology.kubernetes.io/zone
+          operator: In
+          values:
+            - ${AZ}
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: kubernetes.io/os
+          operator: In
+          values:
+            - linux
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values:
+            - on-demand
+        - key: karpenter.k8s.aws/instance-category  # same key as the monitoring pool
+          operator: In
+          values:
+            - c
+            - m
+            - r
+        - key: karpenter.k8s.aws/instance-size
+          operator: In
+          values:
+            - 12xlarge
+        - key: karpenter.k8s.aws/instance-generation
+          operator: Gt
+          values:
+            - "6"
diff --git a/tests/assets/karpenter/ai-ml-training-large.yaml b/tests/assets/karpenter/ai-ml-training-large.yaml
new file mode 100644
index 00000000..76ab80ec
--- /dev/null
+++ b/tests/assets/karpenter/ai-ml-training-large.yaml
@@ -0,0 +1,84 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: ai-ml-training-large-${AZ}  # ${AZ} is substituted before apply
+spec:
+  disruption:
+    budgets:
+      - nodes: 100%
+        reasons:
+          - Empty
+      - nodes: 10%
+        reasons:
+          - Drifted
+          - Underutilized
+    consolidateAfter: 0s
+    consolidationPolicy: WhenEmpty
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        drift: drifting-test
+        purpose: ml-large
+    spec:
+      expireAfter: 720h0m0s
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: ai-training
+      requirements:
+        - key: topology.kubernetes.io/zone
+          operator: In
+          values:
+            - ${AZ}
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: kubernetes.io/os
+          operator: In
+          values:
+            - linux
+        - key: karpenter.k8s.aws/instance-size
+          operator: In
+          values:
+            - large
+            - xlarge
+            - 2xlarge
+            - 4xlarge
+        - key: karpenter.k8s.aws/instance-family
+          operator: In
+          values:
+            - c5
+            - c5a
+            - c5n
+            - c6a
+            - c6g
+            - c6gd
+            - c6gn
+            - c6i
+            - c7g
+            - c7gn
+            - c7i
+            - c8g
+            - m5
+            - m5a
+            - m5n
+            - m6a
+            - m6g
+            - m6i
+            - m7g
+            - m7gd
+            - m7i
+            - m8g
+            - r5
+            - r5a
+            - r5n
+            - r6a
+            - r6g
+            - r6gd
+            - r6i
+            - r7g
+            - r7gd
+            - r7i
+            - r8g
diff --git a/tests/assets/karpenter/eks-perflab-ai-training-nodeclass.yaml b/tests/assets/karpenter/eks-perflab-ai-training-nodeclass.yaml
new file mode 100644
index 00000000..2e84688e
--- /dev/null
+++ b/tests/assets/karpenter/eks-perflab-ai-training-nodeclass.yaml
@@ -0,0 +1,71 @@
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+  name: ai-training  # referenced by the ai-ml-* NodePools via nodeClassRef
+spec:
+  amiFamily: AL2023
+  amiSelectorTerms:
+    - alias: "al2023@${ALIAS_VERSION}"
+  blockDeviceMappings:
+    - deviceName: /dev/xvda
+      ebs:
+        deleteOnTermination: true
+        volumeSize: 70Gi
+        volumeType: gp3
+  kubelet:
+    evictionHard:
+      memory.available: 5%
+      nodefs.available: 10%
+      nodefs.inodesFree: 10%
+    kubeReserved:
+      cpu: 100m
+      ephemeral-storage: 1Gi
+      memory: 100Mi
+    maxPods: 110
+    systemReserved:
+      cpu: 100m
+      ephemeral-storage: 1Gi
+      memory: 100Mi
+  metadataOptions:
+    httpEndpoint: enabled
+    httpProtocolIPv6: disabled
+    httpPutResponseHopLimit: 1
+    httpTokens: required
+  role: KarpenterNodeRole-${CLUSTER_NAME}
+  securityGroupSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: "${CLUSTER_NAME}"
+    - tags:
+        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
+    - tags:
+        kubernetes.io/cluster/${CLUSTER_NAME}: owned
+  subnetSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: "${CLUSTER_NAME}"
+    - tags:
+        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
+  userData: |  # NOTE(review): flags hard-code nodepool=titan-pool; confirm for ai-ml-* pools
+    MIME-Version: 1.0
+    Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+    --BOUNDARY
+    Content-Type: application/node.eks.aws
+
+    apiVersion: node.eks.aws/v1alpha1
+    kind: NodeConfig
+    spec:
+      cluster:
+        name: ${CLUSTER_NAME}
+        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
+        certificateAuthority: ${CLUSTER_CA}
+        cidr: "172.20.0.0/16"
+      kubelet:
+        config:
+          nodeStatusReportFrequency: "60m"
+          nodeLeaseDurationSeconds: 120
+          maxPods: 110
+          clusterDNS: ["172.20.0.10"]
+        flags:
+          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
+          - --register-with-taints=karpenter.sh/unregistered:NoExecute
+    --BOUNDARY--
diff --git a/tests/assets/karpenter/eks-perflab-titan-class-nodeclass.yaml b/tests/assets/karpenter/eks-perflab-titan-class-nodeclass.yaml
new file mode 100644
index 00000000..de900c33
--- /dev/null
+++ b/tests/assets/karpenter/eks-perflab-titan-class-nodeclass.yaml
@@ -0,0 +1,65 @@
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+  name: titan-class  # referenced by titan-pool via nodeClassRef
+spec:
+  amiFamily: Custom
+  amiSelectorTerms:
+    - alias: "al2023@${ALIAS_VERSION}"
+  instanceProfile: KarpenterNodeInstanceProfile-${CLUSTER_NAME}
+  kubelet:
+    evictionHard:
+      memory.available: 5%
+      nodefs.available: 10%
+      nodefs.inodesFree: 10%
+    kubeReserved:
+      cpu: 100m
+      ephemeral-storage: 1Gi
+      memory: 100Mi
+    maxPods: 110
+    systemReserved:
+      cpu: 100m
+      ephemeral-storage: 1Gi
+      memory: 100Mi
+  metadataOptions:
+    httpEndpoint: enabled
+    httpProtocolIPv6: disabled
+    httpPutResponseHopLimit: 1
+    httpTokens: required
+  securityGroupSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: "${CLUSTER_NAME}"
+    - tags:
+        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
+    - tags:
+        kubernetes.io/cluster/${CLUSTER_NAME}: owned
+  subnetSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: "${CLUSTER_NAME}"
+    - tags:
+        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
+  userData: |
+    MIME-Version: 1.0
+    Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+    --BOUNDARY
+    Content-Type: application/node.eks.aws
+
+    apiVersion: node.eks.aws/v1alpha1
+    kind: NodeConfig
+    spec:
+      cluster:
+        name: ${CLUSTER_NAME}
+        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
+        certificateAuthority: ${CLUSTER_CA}
+        cidr: "172.20.0.0/16"
+      kubelet:
+        config:
+          nodeStatusReportFrequency: "60m"
+          nodeLeaseDurationSeconds: 120
+          maxPods: 110
+          clusterDNS: ["172.20.0.10"]
+        flags:
+          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
+          - --register-with-taints=karpenter.sh/unregistered:NoExecute
+    --BOUNDARY--
diff --git a/tests/assets/karpenter/titan-pool.yaml b/tests/assets/karpenter/titan-pool.yaml
new file mode 100644
index 00000000..f404923a
--- /dev/null
+++ b/tests/assets/karpenter/titan-pool.yaml
@@ -0,0 +1,78 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: titan-pool-${AZ}  # ${AZ} is substituted before apply
+spec:
+  disruption:
+    budgets:
+      - nodes: 100%
+        reasons:
+          - Empty
+      - nodes: 10%
+        reasons:
+          - Drifted
+          - Underutilized
+    consolidateAfter: 0s
+    consolidationPolicy: WhenEmpty
+  limits:
+    nodes: "52800"
+  replicas: 10
+  template:
+    spec:
+      expireAfter: 720h0m0s
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: titan-class
+      requirements:
+        - key: topology.kubernetes.io/zone
+          operator: In
+          values:
+            - ${AZ}
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values:
+            - on-demand
+        - key: karpenter.k8s.aws/instance-size
+          operator: In
+          values:
+            - large
+            - xlarge
+            - 2xlarge
+            - 4xlarge
+        - key: karpenter.k8s.aws/instance-family
+          operator: In
+          values:
+            - c5
+            - c5a
+            - c5n
+            - c6a
+            - c6g
+            - c6gd
+            - c6gn
+            - c6i
+            - c7g
+            - c7gn
+            - c7i
+            - c8g
+            - m5
+            - m5a
+            - m5n
+            - m6a
+            - m6g
+            - m6i
+            - m7g
+            - m7gd
+            - m7i
+            - m8g
+            - r5
+            - r5a
+            - r5n
+            - r6a
+            - r6g
+            - r6gd
+            - r6i
+            - r7g
+            - r7gd
+            - r7i
+            - r8g