
Commit d2c23e4

feat(fault-injection): Enable runtime CUDA fault injection toggling without pod restarts (#4679)
1 parent 300e5d5 commit d2c23e4

File tree: 4 files changed (+433, -22 lines)


tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/README.md

Lines changed: 16 additions & 6 deletions
@@ -6,13 +6,16 @@
 
 ## What This Does
 
-Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls.
+Intercepts CUDA calls to simulate GPU failures using LD_PRELOAD. Faults persist across pod restarts via hostPath volumes, enabling realistic hardware failure testing.
 
 ```
-Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes
+Pod calls cudaMalloc() → LD_PRELOAD intercepts → Checks /host-fault/cuda_fault_enabled → Returns error → Pod crashes
 ```
 
-**Result**: Realistic GPU failure testing without hardware damage.
+**Key Features**:
+- **Persistent faults**: hostPath volume (`/var/lib/cuda-fault-test`) survives pod restarts on same node
+- **Runtime toggle**: Enable/disable faults without pod restarts via `/host-fault/cuda_fault_enabled`
+- **Node-specific**: Faults only on target node, healthy nodes unaffected
 
 ## Scope
 
@@ -35,13 +38,20 @@ This library simulates **software/orchestration-level failures** that occur when
 | **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
 | **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
 
+## How It Works
+
+1. **Deployment patching**: Adds hostPath volume + init container to compile library
+2. **LD_PRELOAD injection**: Environment variable loads library before CUDA
+3. **Runtime control**: Toggle file (`/host-fault/cuda_fault_enabled`) controls fault state
+4. **Node persistence**: hostPath ensures faults survive pod restarts on same node
+
 ## Files in This Directory
 
 | File | Purpose |
 |------|---------|
-| `cuda_intercept.c` | C library source that intercepts CUDA calls |
-| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments |
-| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) |
+| `cuda_intercept.c` | C library that intercepts CUDA calls and checks fault markers |
+| `inject_into_pods.py` | Kubernetes deployment patcher (adds hostPath volume + library) |
+| `Makefile` | Local build (optional, for testing) |
 
 ## Prerequisites
 
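The runtime toggle is just a marker file on the node, so a test harness can flip it without re-patching the deployment. Below is a minimal sketch, assuming the worker pod already mounts the hostPath volume at `/host-fault` and that the standard Kubernetes Python client is configured; the pod and namespace names are illustrative, and `set_fault_enabled` is a hypothetical helper, not part of this commit.

```python
# Hedged sketch: flip the node-persistent fault marker from outside the pod.
# Assumes the target pod mounts the hostPath volume at /host-fault (see README above).
from kubernetes import client, config
from kubernetes.stream import stream


def set_fault_enabled(pod_name: str, namespace: str, enabled: bool) -> None:
    """Write 1/0 to /host-fault/cuda_fault_enabled inside a running worker pod."""
    config.load_kube_config()
    core_api = client.CoreV1Api()
    value = "1" if enabled else "0"
    command = ["/bin/sh", "-c", f"echo {value} > /host-fault/cuda_fault_enabled"]
    stream(
        core_api.connect_get_namespaced_pod_exec,
        pod_name,
        namespace,
        command=command,
        stderr=True, stdin=False, stdout=True, tty=False,
    )


# Example (names are placeholders):
# set_fault_enabled("vllm-worker-0", "dynamo", True)   # start injecting faults
# set_fault_enabled("vllm-worker-0", "dynamo", False)  # recover without a restart
```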

tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/cuda_intercept.c

Lines changed: 31 additions & 10 deletions
@@ -59,19 +59,20 @@ static const xid_mapping_t xid_mappings[] = {
 };
 
 // Get XID type and corresponding CUDA error
+// Supports runtime toggling via /tmp/cuda_fault_enabled file
 static void
 get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
 {
     static int initialized = 0;
-    static int cached_inject = 0;
+    static int env_inject = 0;  // From environment variable
     static int cached_xid = 79;  // Default to XID 79
     static cudaError_t cached_error = cudaErrorNoDevice;
 
     if (!initialized) {
-        // Check if injection is enabled
+        // Check if injection is enabled via environment
         char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
         if (env) {
-            cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
+            env_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
         }
 
         // Get XID type
@@ -85,8 +86,7 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
                 if (xid_mappings[i].xid == cached_xid) {
                     cached_error = xid_mappings[i].cuda_error;
                     fprintf(
-                        stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid,
-                        xid_mappings[i].description);
+                        stderr, "[CUDA FAULT INJECTION] Library loaded - XID %d (%s)\n", cached_xid, xid_mappings[i].description);
                     found = 1;
                     break;
                 }
@@ -97,16 +97,37 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
                 cached_xid = 79;
                 cached_error = cudaErrorNoDevice;
             }
-        } else {
-            fprintf(
-                stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
-                cached_inject ? "ENABLED" : "DISABLED");
         }
 
         initialized = 1;
     }
 
-    *inject = cached_inject;
+    // Runtime toggle: Check node-persistent fault marker on EVERY call
+    // Use hostPath (/host-fault) so fault persists across pod restarts on same node
+    // Pod reschedules to different node → no file there → automatic recovery!
+    int runtime_inject = env_inject;  // Default to env var
+
+    // Check hostPath first (persistent across restarts on same node)
+    FILE* toggle_file = fopen("/host-fault/cuda_fault_enabled", "r");
+    if (toggle_file) {
+        char toggle_value[4] = {0};
+        if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
+            runtime_inject = (toggle_value[0] == '1');
+        }
+        fclose(toggle_file);
+    } else {
+        // Fallback to ephemeral /tmp for backwards compatibility
+        toggle_file = fopen("/tmp/cuda_fault_enabled", "r");
+        if (toggle_file) {
+            char toggle_value[4] = {0};
+            if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
+                runtime_inject = (toggle_value[0] == '1');
+            }
+            fclose(toggle_file);
+        }
+    }
+
+    *inject = runtime_inject;
     *xid_type = cached_xid;
     *error_code = cached_error;
 }

tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/inject_into_pods.py

Lines changed: 94 additions & 5 deletions
@@ -201,6 +201,18 @@ def _patch_service_for_injection(
             {"name": "cuda-fault-lib", "emptyDir": {}}
         )
 
+        # Add hostPath volume for persistent fault marker (survives pod restarts on same node)
+        # This simulates persistent hardware failure!
+        service["extraPodSpec"]["volumes"].append(
+            {
+                "name": "node-fault-marker",
+                "hostPath": {
+                    "path": "/var/lib/cuda-fault-test",
+                    "type": "DirectoryOrCreate",
+                },
+            }
+        )
+
         # Add init container to decode base64
         if "initContainers" not in service["extraPodSpec"]:
             service["extraPodSpec"]["initContainers"] = []
@@ -247,7 +259,7 @@ def _patch_service_for_injection(
                 if vm.get("name") != "cuda-fault-lib"
             ]
 
-        # Add mount
+        # Add mount for compiled library
         service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
             {
                 "name": "cuda-fault-lib",
@@ -256,8 +268,18 @@ def _patch_service_for_injection(
             }
         )
 
+        # Add mount for persistent fault marker (hostPath)
+        service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
+            {
+                "name": "node-fault-marker",
+                "mountPath": "/host-fault",
+                "readOnly": False,  # Need write access
+            }
+        )
+
         print(" ✓ Added init container to compile library")
         print(" ✓ Added ConfigMap volume mount")
+        print(" ✓ Added hostPath volume for persistent fault marker")
 
         # Add node affinity to pin pods to target node (simulates real XID 79 behavior)
         if target_node and enable:
@@ -287,14 +309,15 @@ def _patch_service_for_injection(
         service["extraPodSpec"]["volumes"] = [
             v
             for v in service["extraPodSpec"]["volumes"]
-            if v.get("name") not in ["cuda-fault-lib", "cuda-fault-lib-source"]
+            if v.get("name")
+            not in ["cuda-fault-lib", "cuda-fault-lib-source", "node-fault-marker"]
         ]
 
         if "volumeMounts" in service["extraPodSpec"].get("mainContainer", {}):
             service["extraPodSpec"]["mainContainer"]["volumeMounts"] = [
                 vm
                 for vm in service["extraPodSpec"]["mainContainer"]["volumeMounts"]
-                if vm.get("name") != "cuda-fault-lib"
+                if vm.get("name") not in ["cuda-fault-lib", "node-fault-marker"]
             ]
 
         # Remove init container
@@ -323,6 +346,7 @@ def patch_deployment_env(
     use_configmap=True,
     target_node=None,
    xid_type=79,
+    passthrough_mode=False,
 ):
     """Patch deployment to add/remove LD_PRELOAD environment variable.
 
@@ -334,6 +358,8 @@ def patch_deployment_env(
         target_node: If provided, adds node affinity to pin pods to this node
             (simulates real XID where pods crash on the faulty node)
         xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
+        passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0 (library loaded but disabled)
+            Allows baseline testing before enabling faults via toggle
     """
     custom_api = client.CustomObjectsApi()
     apps_api = client.AppsV1Api()
@@ -385,9 +411,14 @@ def patch_deployment_env(
         # Prepare environment variables
         new_envs = []
         if enable:
+            # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
+            fault_enabled_value = "0" if passthrough_mode else "1"
             new_envs = [
                 {"name": "LD_PRELOAD", "value": lib_path},
-                {"name": "CUDA_FAULT_INJECTION_ENABLED", "value": "1"},
+                {
+                    "name": "CUDA_FAULT_INJECTION_ENABLED",
+                    "value": fault_enabled_value,
+                },
                 {"name": "CUDA_XID_TYPE", "value": str(xid_type)},
             ]
 
@@ -400,6 +431,28 @@ def patch_deployment_env(
         available_services = list(services.keys())
         print(f" → Available services: {available_services}")
 
+        # Set aggressive update strategy when enabling (allow all pods to update at once)
+        # This ensures all pods get CUDA faults, not just the first few
+        if enable:
+            if "updateStrategy" not in spec:
+                spec["updateStrategy"] = {}
+            if "rollingUpdate" not in spec["updateStrategy"]:
+                spec["updateStrategy"]["rollingUpdate"] = {}
+
+            # Allow all pods to be unavailable during update
+            spec["updateStrategy"]["rollingUpdate"]["maxUnavailable"] = "100%"
+            # Don't create surge pods
+            spec["updateStrategy"]["rollingUpdate"]["maxSurge"] = 0
+            print(" → Set update strategy: maxUnavailable=100%, maxSurge=0")
+            print("   (All pods will update simultaneously)")
+        else:
+            # Restore default update strategy when disabling
+            if "updateStrategy" in spec:
+                spec["updateStrategy"] = {
+                    "rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}
+                }
+                print(" → Restored default update strategy (maxUnavailable=25%)")
+
         for service_name in services_to_patch:
             if service_name in services:
                 print(f" → Patching service: {service_name}")
@@ -465,6 +518,38 @@ def patch_deployment_env(
         print(f" Services patched: {', '.join(patched_services)}")
         if use_configmap and enable:
             print(f" Library mounted at: {lib_path}")
+
+        # Force restart all worker pods when enabling to apply changes immediately
+        if enable:
+            print(
+                " → Force-deleting all worker pods to apply changes immediately..."
+            )
+            core_api = client.CoreV1Api()
+            try:
+                worker_pods = core_api.list_namespaced_pod(
+                    namespace=namespace,
+                    label_selector=f"nvidia.com/dynamo-graph-deployment-name={deployment_name},nvidia.com/dynamo-component-type=worker",
+                )
+                deleted_count = 0
+                for pod in worker_pods.items:
+                    try:
+                        core_api.delete_namespaced_pod(
+                            name=pod.metadata.name,
+                            namespace=namespace,
+                            grace_period_seconds=0,
+                        )
+                        deleted_count += 1
+                    except Exception as e:
+                        print(
+                            f" ⚠ Could not delete pod {pod.metadata.name}: {e}"
+                        )
+                print(
+                    f" ✓ Deleted {deleted_count} pod(s) - they will restart with CUDA library"
+                )
+            except Exception as e:
+                print(f" ⚠ Could not list/delete pods: {e}")
+                print("   Pods will eventually restart, but may take longer")
+
         return True
 
     except ApiException as e:
@@ -505,11 +590,15 @@ def patch_deployment_env(
 
         if enable:
             # Add new env vars
+            # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
+            fault_enabled_value = "0" if passthrough_mode else "1"
             container.env.append(
                 client.V1EnvVar(name="LD_PRELOAD", value="/tmp/cuda_intercept.so")
             )
             container.env.append(
-                client.V1EnvVar(name="CUDA_FAULT_INJECTION_ENABLED", value="1")
+                client.V1EnvVar(
+                    name="CUDA_FAULT_INJECTION_ENABLED", value=fault_enabled_value
+                )
             )
             container.env.append(
                 client.V1EnvVar(name="CUDA_XID_TYPE", value=str(xid_type))
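Taken together, the new `passthrough_mode` flag and the runtime toggle enable a patch-once, toggle-later workflow. The sketch below illustrates that flow; the leading parameters of `patch_deployment_env()` are not shown in this diff, so the keyword names (`deployment_name`, `namespace`, `enable`) and all resource names are assumptions for illustration only.

```python
# Hedged sketch of the intended workflow enabled by this commit.
from inject_into_pods import patch_deployment_env

# 1. Preload the library in passthrough mode: pods restart once with LD_PRELOAD
#    set but CUDA_FAULT_INJECTION_ENABLED=0, so a clean baseline can run first.
patch_deployment_env(
    deployment_name="my-dynamo-deployment",  # illustrative
    namespace="dynamo",                      # illustrative
    enable=True,
    target_node="gpu-node-1",                # illustrative
    xid_type=79,
    passthrough_mode=True,
)

# 2. When the fault should start, write "1" to /host-fault/cuda_fault_enabled on
#    the target node (e.g. by exec-ing into a worker pod). The library re-reads the
#    marker on every CUDA call, so no further pod restart is needed.
# 3. Writing "0" (or rescheduling the pod to another node, where the marker is
#    absent) recovers automatically.
```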
