Skip to content

Commit 0e6bb7b

Browse files
authored
feat(fault-injection): Add Kubernetes deployment manifests (#4044)
1 parent e10319f commit 0e6bb7b

File tree

4 files changed

+546
-0
lines changed

4 files changed

+546
-0
lines changed
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
---
6+
# Identity that the fault-injection API pods run as. It is referenced by the
# Deployment's serviceAccountName and is the subject of the ClusterRoleBinding.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: fault-injection-system
  name: fault-injection-api
11+
12+
---
13+
# Cluster-wide permissions for the fault-injection API.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: fault-injection-api
rules:
  # Core resources: read access plus patch.
  - apiGroups:
      - ""
    resources:
      - nodes
      - pods
      - services
    verbs:
      - get
      - list
      - watch
      - patch
  # Workload controllers: read-only.
  - apiGroups:
      - apps
    resources:
      - deployments
      - daemonsets
      - statefulsets
    verbs:
      - get
      - list
      - watch
  # NetworkPolicies: may be created and deleted, but not updated or watched.
  - apiGroups:
      - networking.k8s.io
    resources:
      - networkpolicies
    verbs:
      - get
      - list
      - create
      - delete
  # Chaos Mesh experiment resources.
  # NOTE(review): the chaos-mesh-gpu-experiments README also lists TimeChaos,
  # but "timechaos" is not granted here - confirm whether that is intentional.
  - apiGroups:
      - chaos-mesh.org
    resources:
      - networkchaos
      - podchaos
      - stresschaos
      - iochaos
    verbs:
      - get
      - list
      - create
      - delete
      - watch
30+
31+
---
32+
# Grants the fault-injection-api ClusterRole to the fault-injection-api
# ServiceAccount in the fault-injection-system namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: fault-injection-api
subjects:
  - kind: ServiceAccount
    namespace: fault-injection-system
    name: fault-injection-api
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: fault-injection-api
44+
45+
---
46+
# Single-replica Deployment of the fault-injection API server.
# The pod runs on the host network, tolerates the nvidia.com/gpu taint, and is
# pinned to GPU nodes through node affinity.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: fault-injection-api
  namespace: fault-injection-system
  labels:
    app: fault-injection-api
spec:
  replicas: 1
  selector:
    matchLabels:
      app: fault-injection-api
  template:
    metadata:
      labels:
        app: fault-injection-api
    spec:
      serviceAccountName: fault-injection-api
      # Use host network to communicate with hostNetwork agents
      hostNetwork: true
      # Required with hostNetwork so the pod still resolves cluster DNS names.
      dnsPolicy: ClusterFirstWithHostNet
      # Tolerate GPU node taints
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      affinity:
        nodeAffinity:
          # Hard requirement: only schedule onto nodes labelled as having a GPU.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.present
                    operator: In
                    values:
                      - "true"
          # Soft preference: favour the stable A100 instance type when one is
          # available. Expressions inside a single required term are ANDed, so
          # keeping this under "required" would make the pod unschedulable on
          # any cluster that lacks this exact instance type.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              preference:
                matchExpressions:
                  - key: node.kubernetes.io/instance-type
                    operator: In
                    values:
                      - Standard_ND96amsr_A100_v4
      containers:
        - name: api
          # Replace with your Azure Container Registry (ACR)
          image: dynamoci.azurecr.io/fault-injection-api:latest
          imagePullPolicy: Always
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          env:
            # Quoted so the value stays a string rather than an integer.
            - name: PYTHONUNBUFFERED
              value: "1"
          # Liveness and readiness both poll the same /health endpoint.
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "500m"
118+
119+
---
120+
# In-cluster endpoint for the fault-injection API: forwards port 8080 to
# port 8080 on pods labelled app=fault-injection-api.
apiVersion: v1
kind: Service
metadata:
  name: fault-injection-api
  namespace: fault-injection-system
  labels:
    app: fault-injection-api
spec:
  selector:
    app: fault-injection-api
  type: ClusterIP
  ports:
    - name: http
      protocol: TCP
      port: 8080
      targetPort: 8080
136+
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# ChaosMesh Setup for GPU Fault Injection
6+
# Install ChaosMesh if not already present
7+
---
8+
# Namespace targeted by the Helm install commands documented in this file
# (helm install ... -n chaos-mesh).
apiVersion: v1
kind: Namespace
metadata:
  name: chaos-mesh
12+
---
13+
# This is a placeholder - use Helm to install ChaosMesh
14+
#
15+
# Installation commands:
16+
#
17+
# helm repo add chaos-mesh https://charts.chaos-mesh.org
18+
# helm install chaos-mesh chaos-mesh/chaos-mesh -n chaos-mesh \
19+
# --set chaosDaemon.runtime=containerd \
20+
# --set chaosDaemon.socketPath=/run/containerd/containerd.sock \
21+
# --set dashboard.create=true \
22+
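# --set dashboard.securityMode=false
#
# NOTE: dashboard.securityMode=false disables login/token authentication on the
# Chaos Mesh dashboard. Use it only on isolated test clusters.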
23+
#
24+
# Verify installation:
25+
# kubectl get pods -n chaos-mesh
26+
#
27+
# Access dashboard:
28+
# kubectl port-forward -n chaos-mesh svc/chaos-dashboard 2333:2333
29+
# open http://localhost:2333
30+
---
31+
# In-cluster reference documentation for the ChaosMesh-based GPU fault
# experiments. The single README.md key is a Markdown document; it is stored as
# a literal block scalar so its line breaks and code fences are preserved.
apiVersion: v1
kind: ConfigMap
metadata:
  name: chaos-mesh-gpu-experiments
  namespace: fault-injection-system
data:
  README.md: |
    # ChaosMesh GPU Fault Injection

    ChaosMesh provides the following chaos types for GPU fault injection:

    ## PodChaos
    - **pod-kill**: Kill GPU pods (simulates XID 79)
    - **container-kill**: Kill GPU containers
    - **pod-failure**: Make GPU pods unavailable

    ## StressChaos
    - **memory-stress**: Stress GPU node memory (simulates XID 48, 94, 95)
    - **cpu-stress**: Stress GPU node CPU (can trigger thermal issues)

    ## IOChaos
    - **fault**: Inject I/O errors on GPU devices
    - **latency**: Add I/O latency

    ## TimeChaos
    - **time-offset**: Offset system time (can trigger XID 119, 120 timeouts)

    ## NetworkChaos (for multi-GPU scenarios)
    - **partition**: Isolate GPU nodes
    - **loss**: Packet loss between GPU nodes (NVLink errors)
    - **delay**: Network delay

    ## Usage Examples

    See gpu_chaos_mesh.py for programmatic injection via API.

    Or use kubectl. Replace `<target-namespace>` and `<gpu-node-name>` with
    the namespace and node of the workload under test:


    ```bash
    # Kill GPU pod
    kubectl apply -f - <<EOF
    apiVersion: chaos-mesh.org/v1alpha1
    kind: PodChaos
    metadata:
      name: gpu-pod-kill
      namespace: <target-namespace>
    spec:
      action: pod-kill
      mode: one
      selector:
        namespaces: ["<target-namespace>"]
        labelSelectors:
          app: vllm-worker
        nodeSelectors:
          kubernetes.io/hostname: <gpu-node-name>
      duration: 60s
    EOF
    ```

    ```bash
    # Memory stress on GPU node
    kubectl apply -f - <<EOF
    apiVersion: chaos-mesh.org/v1alpha1
    kind: StressChaos
    metadata:
      name: gpu-memory-stress
      namespace: <target-namespace>
    spec:
      mode: one
      selector:
        namespaces: ["<target-namespace>"]
        nodeSelectors:
          kubernetes.io/hostname: <gpu-node-name>
      stressors:
        memory:
          workers: 4
          size: 8GB
      duration: 60s
    EOF
    ```
112+

0 commit comments

Comments
 (0)