Skip to content

Commit ec6d9ab

Browse files
committed
Add ai-ml nodeclasses and nodepools
Description / Motivation: We need to move the manually created NodeClasses and NodePools from the perflab-titan-1 cluster into KIT so they can be reused in future runs. Related Asana Task: https://app.asana.com/1/8442528107068/project/1209254984904634/task/1211563354393458?focus=true Desktop Testing: Tested by creating a pipeline run: https://experimental.scalability.eks.aws.dev/#/namespaces/scalability/pipelineruns/chithres-titan-ai-ml-pipeline-run-v36. Once this commit is merged, I will also raise a PR for the ai-ml-load Pipeline; currently the Pipeline points at the NodePool and NodeClass URLs in my KIT fork.
1 parent be496f9 commit ec6d9ab

7 files changed

+479
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
---
# Karpenter NodePool for xlarge inference nodes, pinned to a single
# availability zone. `${AZ}` is a placeholder that must be substituted
# (e.g. via envsubst) before this manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-inference-xlarge-${AZ}
spec:
  disruption:
    budgets:
      - nodes: 10%
    consolidateAfter: 0s
    consolidationPolicy: WhenEmptyOrUnderutilized
  # NOTE(review): spec.replicas presumably relies on Karpenter's
  # static-capacity NodePool support -- confirm the installed Karpenter
  # version / feature gate accepts this field.
  replicas: 10
  template:
    metadata:
      labels:
        purpose: ml-xlarge
    spec:
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            # Quoted so an empty substitution yields "" instead of null.
            - "${AZ}"
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # Instance category must be selected through Karpenter's AWS
        # well-known label `karpenter.k8s.aws/instance-category`; a
        # `node.kubernetes.io/instance-category` key is not a label that
        # instance types carry, so it would not restrict the selection.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - xlarge
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "6"
        # Explicit deny-list of compute-optimized xlarge types. With the
        # category restricted to m/r above this is presumably redundant,
        # but it is kept as a safeguard.
        - key: node.kubernetes.io/instance-type
          operator: NotIn
          values:
            - c7i-flex.xlarge
            - c7i.xlarge
            - c7a.xlarge
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
---
# Karpenter NodePool for tainted 24xlarge monitoring nodes in one
# availability zone. `${AZ}` is a placeholder to be substituted before
# the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-monitoring-24xlarge-${AZ}
spec:
  # Sizing.
  # NOTE(review): spec.replicas presumably relies on Karpenter's
  # static-capacity NodePool support -- confirm version / feature gate.
  replicas: 1
  limits:
    nodes: "26400"

  # Disruption behaviour: separate budgets per disruption reason.
  disruption:
    consolidationPolicy: WhenEmpty
    consolidateAfter: 0s
    budgets:
      - nodes: 100%
        reasons:
          - Empty
      - nodes: 10%
        reasons:
          - Drifted
          - Underutilized

  template:
    metadata:
      labels:
        purpose: ml-24xlarge
    spec:
      expireAfter: 720h0m0s
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      # Only pods tolerating `monitoring=true:NoSchedule` can land here.
      taints:
        - key: monitoring
          value: "true"
          effect: NoSchedule
      requirements:
        # Placement
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        # Platform
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        # Purchase option
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # Instance shape: c/m/r category, 24xlarge, generation > 4
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - 24xlarge
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "4"
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
---
# Karpenter NodePool for 12xlarge operator nodes, pinned to a single
# availability zone. `${AZ}` is a placeholder that must be substituted
# (e.g. via envsubst) before this manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-operator-12xlarge-${AZ}
spec:
  disruption:
    budgets:
      # NOTE(review): per Karpenter's disruption-budget semantics a 0%
      # budget is expected to block voluntary disruption -- confirm this
      # is the intent for this pool.
      - nodes: 0%
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  # NOTE(review): spec.replicas presumably relies on Karpenter's
  # static-capacity NodePool support; the pool starts at zero nodes.
  replicas: 0
  template:
    metadata:
      labels:
        purpose: ml-12xlarge
    spec:
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            # Quoted so an empty substitution yields "" instead of null.
            - "${AZ}"
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # Instance category must be selected through Karpenter's AWS
        # well-known label `karpenter.k8s.aws/instance-category`; a
        # `node.kubernetes.io/instance-category` key is not a label that
        # instance types carry, so it would not restrict the selection.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - 12xlarge
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "6"
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
---
# Karpenter NodePool for training nodes (large through 4xlarge) in one
# availability zone. `${AZ}` is a placeholder to be substituted before
# the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-training-large-${AZ}
spec:
  # NOTE(review): spec.replicas presumably relies on Karpenter's
  # static-capacity NodePool support -- confirm version / feature gate.
  replicas: 1

  # Disruption behaviour: separate budgets per disruption reason.
  disruption:
    consolidationPolicy: WhenEmpty
    consolidateAfter: 0s
    budgets:
      - nodes: 100%
        reasons:
          - Empty
      - nodes: 10%
        reasons:
          - Drifted
          - Underutilized

  template:
    metadata:
      labels:
        purpose: ml-large
        drift: drifting-test
    spec:
      expireAfter: 720h0m0s
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      # NOTE(review): no karpenter.sh/capacity-type requirement is set on
      # this pool -- confirm that leaving the purchase option
      # unconstrained is intended.
      requirements:
        # Placement
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        # Platform
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        # Allowed instance sizes (minValues is intentionally disabled).
        - key: karpenter.k8s.aws/instance-size
          # minValues: 4
          operator: In
          values:
            - large
            - xlarge
            - 2xlarge
            - 4xlarge
        # Allowed instance families (minValues is intentionally disabled).
        # NOTE(review): several entries (c6g*, c7g*, c8g, m6g, m7g*, m8g,
        # r6g*, r7g*, r8g) look like Graviton/arm64 families, which the
        # amd64 arch requirement above would exclude -- confirm.
        - key: karpenter.k8s.aws/instance-family
          # minValues: 33
          operator: In
          values:
            # compute-optimized
            - c5
            - c5a
            - c5n
            - c6a
            - c6g
            - c6gd
            - c6gn
            - c6i
            - c7g
            - c7gn
            - c7i
            - c8g
            # general-purpose
            - m5
            - m5a
            - m5n
            - m6a
            - m6g
            - m6i
            - m7g
            - m7gd
            - m7i
            - m8g
            # memory-optimized
            - r5
            - r5a
            - r5n
            - r6a
            - r6g
            - r6gd
            - r6i
            - r7g
            - r7gd
            - r7i
            - r8g
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
---
# Karpenter EC2NodeClass referenced (by name `ai-training`) from the
# ai-ml NodePools. `${CLUSTER_NAME}`, `${ALIAS_VERSION}`,
# `${CLUSTER_ENDPOINT}` and `${CLUSTER_CA}` are placeholders that must be
# substituted before the manifest is applied.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: ai-training
spec:
  # Identity and image.
  role: KarpenterNodeRole-${CLUSTER_NAME}
  amiFamily: AL2023
  amiSelectorTerms:
    - alias: "al2023@${ALIAS_VERSION}"

  # Network discovery: each list entry is a separate tag-based selector.
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
    - tags:
        kubernetes.io/cluster/${CLUSTER_NAME}: owned

  # Root volume.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        volumeType: gp3
        volumeSize: 70Gi
        deleteOnTermination: true

  # Instance metadata service: token-required, single hop.
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 1
    httpTokens: required

  # Kubelet settings managed through Karpenter.
  kubelet:
    maxPods: 110
    evictionHard:
      memory.available: 5%
      nodefs.available: 10%
      nodefs.inodesFree: 10%
    kubeReserved:
      cpu: 100m
      memory: 100Mi
      ephemeral-storage: 1Gi
    systemReserved:
      cpu: 100m
      memory: 100Mi
      ephemeral-storage: 1Gi

  # MIME multi-part user data carrying a nodeadm NodeConfig document.
  # Everything below `userData: |` is a literal string; do not add YAML
  # comments inside it unless they are meant to reach the node.
  # NOTE(review): the kubelet --node-labels flag hard-codes
  # karpenter.sh/nodepool=titan-pool, which matches none of the ai-ml-*
  # NodePool names -- confirm whether it should be dropped or templated.
  # NOTE(review): the service CIDR (172.20.0.0/16) and cluster DNS IP
  # (172.20.0.10) are hard-coded -- confirm they hold for every target
  # cluster.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: application/node.eks.aws

    apiVersion: node.eks.aws/v1alpha1
    kind: NodeConfig
    spec:
      cluster:
        name: ${CLUSTER_NAME}
        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
        certificateAuthority: ${CLUSTER_CA}
        cidr: "172.20.0.0/16"
      kubelet:
        config:
          nodeStatusReportFrequency: "60m"
          nodeLeaseDurationSeconds: 120
          maxPods: 110
          clusterDNS: ["172.20.0.10"]
        flags:
          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
          - --register-with-taints=karpenter.sh/unregistered:NoExecute
    --BOUNDARY--

0 commit comments

Comments
 (0)