ai-dynamo
diff --git a/‎tests/fault_tolerance/hardware/fault-injection-service/deploy/api-service.yaml‎
Lines changed: 136 additions & 0 deletions b/‎tests/fault_tolerance/hardware/fault-injection-service/deploy/api-service.yaml‎
Lines changed: 136 additions & 0 deletions
diff --git a/‎tests/fault_tolerance/hardware/fault-injection-service/deploy/chaos-mesh-gpu.yaml‎
Lines changed: 112 additions & 0 deletions b/‎tests/fault_tolerance/hardware/fault-injection-service/deploy/chaos-mesh-gpu.yaml‎
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Identity used by the fault-injection API pod (referenced through
+# serviceAccountName in the Deployment in this file).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  namespace: fault-injection-system
+  name: fault-injection-api
+
+---
+# Cluster-wide permissions granted to the fault-injection API.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: fault-injection-api
+rules:
+# Core API group: read access plus patch on nodes, pods and services.
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - pods
+  - services
+  verbs:
+  - get
+  - list
+  - watch
+  - patch
+# Workload controllers: read-only.
+- apiGroups:
+  - apps
+  resources:
+  - deployments
+  - daemonsets
+  - statefulsets
+  verbs:
+  - get
+  - list
+  - watch
+# NetworkPolicies: read, create and delete (no update/patch/watch).
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - networkpolicies
+  verbs:
+  - get
+  - list
+  - create
+  - delete
+# Chaos Mesh experiments: full lifecycle except update/patch.
+# NOTE(review): the README in chaos-mesh-gpu.yaml also lists TimeChaos, which
+# is not granted here - confirm whether the API needs "timechaos".
+- apiGroups:
+  - chaos-mesh.org
+  resources:
+  - networkchaos
+  - podchaos
+  - stresschaos
+  - iochaos
+  verbs:
+  - get
+  - list
+  - create
+  - delete
+  - watch
+
+---
+# Grants the fault-injection-api ClusterRole to the service account of the
+# same name in the fault-injection-system namespace.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: fault-injection-api
+# Who receives the permissions.
+subjects:
+- kind: ServiceAccount
+  namespace: fault-injection-system
+  name: fault-injection-api
+# Which role is granted.
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: fault-injection-api
+
+---
+# API server for the fault-injection service: a single replica pinned to GPU
+# nodes and attached to the host network.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: fault-injection-api
+  namespace: fault-injection-system
+  labels:
+    app: fault-injection-api
+spec:
+  replicas: 1
+  # hostNetwork pods claim containerPort 8080 on the node itself, so the
+  # default RollingUpdate surge pod cannot share a node with the pod it
+  # replaces and the rollout stalls when only one node satisfies the affinity
+  # below. Recreate stops the old pod before starting the new one.
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: fault-injection-api
+  template:
+    metadata:
+      labels:
+        app: fault-injection-api
+    spec:
+      serviceAccountName: fault-injection-api
+      # Use host network to communicate with hostNetwork agents
+      hostNetwork: true
+      # Keep resolving cluster DNS names while on the host network
+      dnsPolicy: ClusterFirstWithHostNet
+      # Tolerate GPU node taints
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            # Expressions inside one term are ANDed: both are hard requirements.
+            - matchExpressions:
+              # Require nodes labelled as having a GPU
+              - key: nvidia.com/gpu.present
+                operator: In
+                values:
+                - "true"
+              # Require the A100 instance type (hard constraint, not a preference)
+              - key: node.kubernetes.io/instance-type
+                operator: In
+                values:
+                - Standard_ND96amsr_A100_v4
+      containers:
+      - name: api
+        # Replace with your Azure Container Registry (ACR)
+        # NOTE(review): ":latest" is a mutable tag; pin a version or digest if
+        # reproducible test runs are required.
+        image: dynamoci.azurecr.io/fault-injection-api:latest
+        imagePullPolicy: Always
+        ports:
+        - name: http
+          containerPort: 8080
+          protocol: TCP
+        env:
+        # Disable Python stdout/stderr buffering so logs show up immediately
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        # Restart the container if GET /health on 8080 stops answering
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          initialDelaySeconds: 10
+          periodSeconds: 30
+        # Only route Service traffic once GET /health on 8080 succeeds
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "100m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+
+---
+# In-cluster endpoint for the API: forwards port 8080 to port 8080 of the
+# pods labelled app=fault-injection-api.
+apiVersion: v1
+kind: Service
+metadata:
+  name: fault-injection-api
+  namespace: fault-injection-system
+  labels:
+    app: fault-injection-api
+spec:
+  type: ClusterIP
+  # Backing pods
+  selector:
+    app: fault-injection-api
+  ports:
+  - name: http
+    protocol: TCP
+    port: 8080
+    targetPort: 8080
+
@@ -0,0 +1,112 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# ChaosMesh Setup for GPU Fault Injection
+# Install ChaosMesh if not already present
+---
+# Namespace targeted by the Helm install commands below (-n chaos-mesh).
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: chaos-mesh
+---
+# This YAML document is intentionally empty: Chaos Mesh itself is installed
+# with Helm rather than from this manifest.
+#
+# Installation commands:
+#
+# helm repo add chaos-mesh https://charts.chaos-mesh.org
+# helm install chaos-mesh chaos-mesh/chaos-mesh -n chaos-mesh \
+#   --set chaosDaemon.runtime=containerd \
+#   --set chaosDaemon.socketPath=/run/containerd/containerd.sock \
+#   --set dashboard.create=true \
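+#   --set dashboard.securityMode=false
+#
+# NOTE: securityMode=false lets anyone who can reach the dashboard use it
+# without credentials; set it only on isolated test clusters.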
+#
+# Verify installation:
+#   kubectl get pods -n chaos-mesh
+#
+# Access dashboard:
+#   kubectl port-forward -n chaos-mesh svc/chaos-dashboard 2333:2333
+#   open http://localhost:2333
+---
+# Ships the usage notes below as a README.md entry; it holds documentation
+# only and creates no chaos experiments itself.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  namespace: fault-injection-system
+  name: chaos-mesh-gpu-experiments
+data:
+  README.md: |
+    # ChaosMesh GPU Fault Injection
+
+    Chaos Mesh has no GPU-specific chaos type; the following generic chaos
+    types are used to approximate GPU faults:
+
+    ## PodChaos
+    - **pod-kill**: Kill GPU pods (simulates XID 79)
+    - **container-kill**: Kill GPU containers
+    - **pod-failure**: Make GPU pods unavailable
+
+    ## StressChaos
+    - **memory-stress**: Stress GPU node memory (simulates XID 48, 94, 95)
+    - **cpu-stress**: Stress GPU node CPU (can trigger thermal issues)
+
+    ## IOChaos
+    - **fault**: Inject I/O errors on GPU devices
+    - **latency**: Add I/O latency
+
+    ## TimeChaos
+    - **time-offset**: Offset system time (can trigger XID 119, 120 timeouts)
+
+    ## NetworkChaos (for multi-GPU scenarios)
+    - **partition**: Isolate GPU nodes
+    - **loss**: Packet loss between GPU nodes (NVLink errors)
+    - **delay**: Network delay
+
+    ## Usage Examples
+
+    See gpu_chaos_mesh.py for programmatic injection via API.
+
+    Or use kubectl (replace `dynamo-oviya` and `<gpu-node-name>` with your
+    workload namespace and target node):
+
+    ```bash
+    # Kill GPU pod
+    kubectl apply -f - <<EOF
+    apiVersion: chaos-mesh.org/v1alpha1
+    kind: PodChaos
+    metadata:
+      name: gpu-pod-kill
+      namespace: dynamo-oviya
+    spec:
+      action: pod-kill
+      mode: one
+      selector:
+        namespaces: ["dynamo-oviya"]
+        labelSelectors:
+          app: vllm-worker
+        nodeSelectors:
+          kubernetes.io/hostname: <gpu-node-name>
+      duration: 60s
+    EOF
+    ```
+
+    ```bash
+    # Memory stress on GPU node
+    kubectl apply -f - <<EOF
+    apiVersion: chaos-mesh.org/v1alpha1
+    kind: StressChaos
+    metadata:
+      name: gpu-memory-stress
+      namespace: dynamo-oviya
+    spec:
+      mode: one
+      selector:
+        namespaces: ["dynamo-oviya"]
+        nodeSelectors:
+          kubernetes.io/hostname: <gpu-node-name>
+      stressors:
+        memory:
+          workers: 4
+          size: 8GB
+      duration: 60s
+    EOF
+    ```
+