File tree Expand file tree Collapse file tree 7 files changed +479
-0
lines changed
Expand file tree Collapse file tree 7 files changed +479
-0
---
# NodePool: ai-ml-inference-xlarge-${AZ}
# Fixed-size pool (spec.replicas) of xlarge on-demand nodes for inference,
# pinned to a single availability zone. ${AZ} is a template placeholder that
# must be substituted before the manifest is applied.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-inference-xlarge-${AZ}
spec:
  disruption:
    budgets:
      # At most 10% of this pool's nodes may be disrupted at the same time.
      - nodes: 10%
    consolidateAfter: 0s
    consolidationPolicy: WhenEmptyOrUnderutilized
  replicas: 10
  template:
    metadata:
      labels:
        purpose: ml-xlarge
    spec:
      # Nodes are replaced after 30 days.
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # The instance category is exposed under the karpenter.k8s.aws/ prefix.
        # The previous key, node.kubernetes.io/instance-category, is not a
        # well-known label, so it did not restrict instance selection to the
        # m and r categories.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - xlarge
        # Gt compares numerically: generation 7 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "6"
        # Explicit exclusions of c-category types. With the category filter
        # above in effect these are already excluded; kept as a safeguard.
        - key: node.kubernetes.io/instance-type
          operator: NotIn
          values:
            - c7i-flex.xlarge
            - c7i.xlarge
            - c7a.xlarge
---
# NodePool: ai-ml-monitoring-24xlarge-${AZ}
# Single-replica pool of 24xlarge on-demand nodes, tainted so that only
# workloads tolerating `monitoring=true:NoSchedule` are scheduled onto it.
# ${AZ} is a template placeholder that must be substituted before apply.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-monitoring-24xlarge-${AZ}
spec:
  disruption:
    budgets:
      # Empty nodes may all be removed at once.
      - nodes: 100%
        reasons:
          - Empty
      # Drift / underutilization disruptions are limited to 10% of nodes at a time.
      - nodes: 10%
        reasons:
          - Drifted
          - Underutilized
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  limits:
    # Upper bound on the number of nodes this pool may own.
    nodes: "26400"
  replicas: 1
  template:
    metadata:
      labels:
        purpose: ml-24xlarge
    spec:
      # Nodes are replaced after 30 days.
      expireAfter: 720h0m0s
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - 24xlarge
        # Gt compares numerically: generation 5 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "4"
      taints:
        - effect: NoSchedule
          key: monitoring
          value: "true"
---
# NodePool: ai-ml-operator-12xlarge-${AZ}
# Pool of 12xlarge on-demand nodes, currently scaled to zero replicas.
# ${AZ} is a template placeholder that must be substituted before apply.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-operator-12xlarge-${AZ}
spec:
  disruption:
    budgets:
      # A 0% budget means no node in this pool may be voluntarily disrupted.
      - nodes: 0%
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  replicas: 0
  template:
    metadata:
      labels:
        purpose: ml-12xlarge
    spec:
      # Nodes are replaced after 30 days.
      expireAfter: 720h
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.sh/capacity-type
          operator: In
          values:
            - on-demand
        # The instance category is exposed under the karpenter.k8s.aws/ prefix.
        # The previous key, node.kubernetes.io/instance-category, is not a
        # well-known label, so it did not restrict instance selection.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values:
            - c
            - m
            - r
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
            - 12xlarge
        # Gt compares numerically: generation 7 and newer.
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values:
            - "6"
---
# NodePool: ai-ml-training-large-${AZ}
# Single-replica pool for training workloads, drawing from large..4xlarge
# sizes across an explicit list of c/m/r instance families.
# ${AZ} is a template placeholder that must be substituted before apply.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: ai-ml-training-large-${AZ}
spec:
  disruption:
    budgets:
      # Empty nodes may all be removed at once.
      - nodes: 100%
        reasons:
          - Empty
      # Drift / underutilization disruptions are limited to 10% of nodes at a time.
      - nodes: 10%
        reasons:
          - Drifted
          - Underutilized
    consolidateAfter: 0s
    consolidationPolicy: WhenEmpty
  replicas: 1
  template:
    metadata:
      labels:
        drift: drifting-test
        purpose: ml-large
    spec:
      # Nodes are replaced after 30 days.
      expireAfter: 720h0m0s
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: ai-training
      requirements:
        - key: topology.kubernetes.io/zone
          operator: In
          values:
            - ${AZ}
        - key: kubernetes.io/arch
          operator: In
          values:
            - amd64
        - key: kubernetes.io/os
          operator: In
          values:
            - linux
        - key: karpenter.k8s.aws/instance-size
          # minValues: 4
          operator: In
          values:
            - large
            - xlarge
            - 2xlarge
            - 4xlarge
        # NOTE(review): 15 of the 33 families below are Graviton (arm64):
        # c6g, c6gd, c6gn, c7g, c7gn, c8g, m6g, m7g, m7gd, m8g, r6g, r6gd,
        # r7g, r7gd, r8g. They cannot satisfy the amd64 arch requirement
        # above, so only 18 families are usable and `minValues: 33` could
        # not be met if it were re-enabled. Either add arm64 to the arch
        # requirement or lower minValues before uncommenting it.
        - key: karpenter.k8s.aws/instance-family
          # minValues: 33
          operator: In
          values:
            - c5
            - c5a
            - c5n
            - c6a
            - c6g
            - c6gd
            - c6gn
            - c6i
            - c7g
            - c7gn
            - c7i
            - c8g
            - m5
            - m5a
            - m5n
            - m6a
            - m6g
            - m6i
            - m7g
            - m7gd
            - m7i
            - m8g
            - r5
            - r5a
            - r5n
            - r6a
            - r6g
            - r6gd
            - r6i
            - r7g
            - r7gd
            - r7i
            - r8g
---
# EC2NodeClass: ai-training
# AWS-specific node settings for NodePools that reference the node class
# `ai-training`: AMI, root volume, kubelet reservations, IMDS options,
# IAM role, subnet / security-group discovery and nodeadm user data.
# ${ALIAS_VERSION}, ${CLUSTER_NAME}, ${CLUSTER_ENDPOINT} and ${CLUSTER_CA}
# are template placeholders that must be substituted before apply.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: ai-training
spec:
  amiFamily: AL2023
  amiSelectorTerms:
    - alias: "al2023@${ALIAS_VERSION}"
  blockDeviceMappings:
    # Root volume.
    - deviceName: /dev/xvda
      ebs:
        deleteOnTermination: true
        volumeSize: 70Gi
        volumeType: gp3
  kubelet:
    evictionHard:
      memory.available: 5%
      nodefs.available: 10%
      nodefs.inodesFree: 10%
    kubeReserved:
      cpu: 100m
      ephemeral-storage: 1Gi
      memory: 100Mi
    maxPods: 110
    systemReserved:
      cpu: 100m
      ephemeral-storage: 1Gi
      memory: 100Mi
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    # IMDSv2 is required (httpTokens) and the hop limit is 1.
    httpPutResponseHopLimit: 1
    httpTokens: required
  role: KarpenterNodeRole-${CLUSTER_NAME}
  # Selector terms are OR-ed: a resource matching any one term is selected.
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
    - tags:
        kubernetes.io/cluster/${CLUSTER_NAME}: owned
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
    - tags:
        aws:cloudformation:stack-name: "${CLUSTER_NAME}"
  # MIME multipart user data carrying a nodeadm NodeConfig document.
  # Everything below `userData: |` is literal content passed to the instance,
  # so review notes are kept out here rather than inside the block.
  # NOTE(review): the --node-labels flag hard-codes
  # karpenter.sh/nodepool=titan-pool, which matches none of the NodePools
  # that reference this node class -- confirm whether it is still needed.
  # NOTE(review): nodeStatusReportFrequency is "60m"; confirm this unusually
  # long interval is intentional.
  userData: |
    MIME-Version: 1.0
    Content-Type: multipart/mixed; boundary="BOUNDARY"

    --BOUNDARY
    Content-Type: application/node.eks.aws

    apiVersion: node.eks.aws/v1alpha1
    kind: NodeConfig
    spec:
      cluster:
        name: ${CLUSTER_NAME}
        apiServerEndpoint: ${CLUSTER_ENDPOINT} # Using the actual cluster endpoint
        certificateAuthority: ${CLUSTER_CA}
        cidr: "172.20.0.0/16"
      kubelet:
        config:
          nodeStatusReportFrequency: "60m"
          nodeLeaseDurationSeconds: 120
          maxPods: 110
          clusterDNS: ["172.20.0.10"]
        flags:
          - --node-labels=karpenter.sh/capacity-type=on-demand,karpenter.sh/nodepool=titan-pool
          - --register-with-taints=karpenter.sh/unregistered:NoExecute
    --BOUNDARY--
You can’t perform that action at this time.
0 commit comments