
Commit d2c23e4

feat(fault-injection): Enable runtime CUDA fault injection toggling without pod restarts (#4679)
1 parent 300e5d5 commit d2c23e4

File tree: 4 files changed (+433, -22 lines)


tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/README.md

Lines changed: 16 additions & 6 deletions
@@ -6,13 +6,16 @@
 
 ## What This Does
 
-Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls.
+Intercepts CUDA calls to simulate GPU failures using LD_PRELOAD. Faults persist across pod restarts via hostPath volumes, enabling realistic hardware failure testing.
 
 ```
-Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes
+Pod calls cudaMalloc() → LD_PRELOAD intercepts → Checks /host-fault/cuda_fault_enabled → Returns error → Pod crashes
 ```
 
-**Result**: Realistic GPU failure testing without hardware damage.
+**Key Features**:
+- **Persistent faults**: hostPath volume (`/var/lib/cuda-fault-test`) survives pod restarts on same node
+- **Runtime toggle**: Enable/disable faults without pod restarts via `/host-fault/cuda_fault_enabled`
+- **Node-specific**: Faults only on target node, healthy nodes unaffected
 
 ## Scope
 
@@ -35,13 +38,20 @@ This library simulates **software/orchestration-level failures** that occur when
 | **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
 | **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
 
+## How It Works
+
+1. **Deployment patching**: Adds hostPath volume + init container to compile library
+2. **LD_PRELOAD injection**: Environment variable loads library before CUDA
+3. **Runtime control**: Toggle file (`/host-fault/cuda_fault_enabled`) controls fault state
+4. **Node persistence**: hostPath ensures faults survive pod restarts on same node
+
 ## Files in This Directory
 
 | File | Purpose |
 |------|---------|
-| `cuda_intercept.c` | C library source that intercepts CUDA calls |
-| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments |
-| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) |
+| `cuda_intercept.c` | C library that intercepts CUDA calls and checks fault markers |
+| `inject_into_pods.py` | Kubernetes deployment patcher (adds hostPath volume + library) |
+| `Makefile` | Local build (optional, for testing) |
 
 ## Prerequisites
 
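The runtime toggle is just a marker file on the node, so a test harness can flip it without re-patching the deployment. Below is a minimal sketch, assuming the worker pod already mounts the hostPath volume at `/host-fault` and that the standard Kubernetes Python client is configured; the pod and namespace names are illustrative, and `set_fault_enabled` is a hypothetical helper, not part of this commit.

```python
# Hedged sketch: flip the node-persistent fault marker from outside the pod.
# Assumes the target pod mounts the hostPath volume at /host-fault (see README above).
from kubernetes import client, config
from kubernetes.stream import stream


def set_fault_enabled(pod_name: str, namespace: str, enabled: bool) -> None:
    """Write 1/0 to /host-fault/cuda_fault_enabled inside a running worker pod."""
    config.load_kube_config()
    core_api = client.CoreV1Api()
    value = "1" if enabled else "0"
    command = ["/bin/sh", "-c", f"echo {value} > /host-fault/cuda_fault_enabled"]
    stream(
        core_api.connect_get_namespaced_pod_exec,
        pod_name,
        namespace,
        command=command,
        stderr=True, stdin=False, stdout=True, tty=False,
    )


# Example (names are placeholders):
# set_fault_enabled("vllm-worker-0", "dynamo", True)   # start injecting faults
# set_fault_enabled("vllm-worker-0", "dynamo", False)  # recover without a restart
```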

tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/cuda_intercept.c

Lines changed: 31 additions & 10 deletions
@@ -59,19 +59,20 @@ static const xid_mapping_t xid_mappings[] = {
 };
 
 // Get XID type and corresponding CUDA error
+// Supports runtime toggling via /tmp/cuda_fault_enabled file
 static void
 get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
 {
     static int initialized = 0;
-    static int cached_inject = 0;
+    static int env_inject = 0;  // From environment variable
     static int cached_xid = 79;  // Default to XID 79
     static cudaError_t cached_error = cudaErrorNoDevice;
 
     if (!initialized) {
-        // Check if injection is enabled
+        // Check if injection is enabled via environment
         char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
         if (env) {
-            cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
+            env_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
         }
 
         // Get XID type
@@ -85,8 +86,7 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
                 if (xid_mappings[i].xid == cached_xid) {
                     cached_error = xid_mappings[i].cuda_error;
                     fprintf(
-                        stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid,
-                        xid_mappings[i].description);
+                        stderr, "[CUDA FAULT INJECTION] Library loaded - XID %d (%s)\n", cached_xid, xid_mappings[i].description);
                     found = 1;
                     break;
                 }
@@ -97,16 +97,37 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
                 cached_xid = 79;
                 cached_error = cudaErrorNoDevice;
             }
-        } else {
-            fprintf(
-                stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
-                cached_inject ? "ENABLED" : "DISABLED");
         }
 
         initialized = 1;
     }
 
-    *inject = cached_inject;
+    // Runtime toggle: Check node-persistent fault marker on EVERY call
+    // Use hostPath (/host-fault) so fault persists across pod restarts on same node
+    // Pod reschedules to different node → no file there → automatic recovery!
+    int runtime_inject = env_inject;  // Default to env var
+
+    // Check hostPath first (persistent across restarts on same node)
+    FILE* toggle_file = fopen("/host-fault/cuda_fault_enabled", "r");
+    if (toggle_file) {
+        char toggle_value[4] = {0};
+        if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
+            runtime_inject = (toggle_value[0] == '1');
+        }
+        fclose(toggle_file);
+    } else {
+        // Fallback to ephemeral /tmp for backwards compatibility
+        toggle_file = fopen("/tmp/cuda_fault_enabled", "r");
+        if (toggle_file) {
+            char toggle_value[4] = {0};
+            if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
+                runtime_inject = (toggle_value[0] == '1');
+            }
+            fclose(toggle_file);
+        }
+    }
+
+    *inject = runtime_inject;
     *xid_type = cached_xid;
     *error_code = cached_error;
 }

tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/inject_into_pods.py

Lines changed: 94 additions & 5 deletions
@@ -201,6 +201,18 @@ def _patch_service_for_injection(
             {"name": "cuda-fault-lib", "emptyDir": {}}
         )
 
+        # Add hostPath volume for persistent fault marker (survives pod restarts on same node)
+        # This simulates persistent hardware failure!
+        service["extraPodSpec"]["volumes"].append(
+            {
+                "name": "node-fault-marker",
+                "hostPath": {
+                    "path": "/var/lib/cuda-fault-test",
+                    "type": "DirectoryOrCreate",
+                },
+            }
+        )
+
         # Add init container to decode base64
         if "initContainers" not in service["extraPodSpec"]:
             service["extraPodSpec"]["initContainers"] = []
@@ -247,7 +259,7 @@ def _patch_service_for_injection(
                 if vm.get("name") != "cuda-fault-lib"
             ]
 
-        # Add mount
+        # Add mount for compiled library
         service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
             {
                 "name": "cuda-fault-lib",
@@ -256,8 +268,18 @@ def _patch_service_for_injection(
             }
         )
 
+        # Add mount for persistent fault marker (hostPath)
+        service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
+            {
+                "name": "node-fault-marker",
+                "mountPath": "/host-fault",
+                "readOnly": False,  # Need write access
+            }
+        )
+
         print(" ✓ Added init container to compile library")
         print(" ✓ Added ConfigMap volume mount")
+        print(" ✓ Added hostPath volume for persistent fault marker")
 
         # Add node affinity to pin pods to target node (simulates real XID 79 behavior)
         if target_node and enable:
@@ -287,14 +309,15 @@ def _patch_service_for_injection(
         service["extraPodSpec"]["volumes"] = [
             v
             for v in service["extraPodSpec"]["volumes"]
-            if v.get("name") not in ["cuda-fault-lib", "cuda-fault-lib-source"]
+            if v.get("name")
+            not in ["cuda-fault-lib", "cuda-fault-lib-source", "node-fault-marker"]
         ]
 
         if "volumeMounts" in service["extraPodSpec"].get("mainContainer", {}):
             service["extraPodSpec"]["mainContainer"]["volumeMounts"] = [
                 vm
                 for vm in service["extraPodSpec"]["mainContainer"]["volumeMounts"]
-                if vm.get("name") != "cuda-fault-lib"
+                if vm.get("name") not in ["cuda-fault-lib", "node-fault-marker"]
             ]
 
         # Remove init container
@@ -323,6 +346,7 @@ def patch_deployment_env(
     use_configmap=True,
     target_node=None,
    xid_type=79,
+    passthrough_mode=False,
 ):
     """Patch deployment to add/remove LD_PRELOAD environment variable.
 
@@ -334,6 +358,8 @@ def patch_deployment_env(
         target_node: If provided, adds node affinity to pin pods to this node
             (simulates real XID where pods crash on the faulty node)
         xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
+        passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0 (library loaded but disabled)
+            Allows baseline testing before enabling faults via toggle
     """
     custom_api = client.CustomObjectsApi()
     apps_api = client.AppsV1Api()
@@ -385,9 +411,14 @@ def patch_deployment_env(
         # Prepare environment variables
         new_envs = []
         if enable:
+            # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
+            fault_enabled_value = "0" if passthrough_mode else "1"
             new_envs = [
                 {"name": "LD_PRELOAD", "value": lib_path},
-                {"name": "CUDA_FAULT_INJECTION_ENABLED", "value": "1"},
+                {
+                    "name": "CUDA_FAULT_INJECTION_ENABLED",
+                    "value": fault_enabled_value,
+                },
                 {"name": "CUDA_XID_TYPE", "value": str(xid_type)},
             ]
 
@@ -400,6 +431,28 @@ def patch_deployment_env(
         available_services = list(services.keys())
         print(f" → Available services: {available_services}")
 
+        # Set aggressive update strategy when enabling (allow all pods to update at once)
+        # This ensures all pods get CUDA faults, not just the first few
+        if enable:
+            if "updateStrategy" not in spec:
+                spec["updateStrategy"] = {}
+            if "rollingUpdate" not in spec["updateStrategy"]:
+                spec["updateStrategy"]["rollingUpdate"] = {}
+
+            # Allow all pods to be unavailable during update
+            spec["updateStrategy"]["rollingUpdate"]["maxUnavailable"] = "100%"
+            # Don't create surge pods
+            spec["updateStrategy"]["rollingUpdate"]["maxSurge"] = 0
+            print(" → Set update strategy: maxUnavailable=100%, maxSurge=0")
+            print("   (All pods will update simultaneously)")
+        else:
+            # Restore default update strategy when disabling
+            if "updateStrategy" in spec:
+                spec["updateStrategy"] = {
+                    "rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}
+                }
+                print(" → Restored default update strategy (maxUnavailable=25%)")
+
         for service_name in services_to_patch:
             if service_name in services:
                 print(f" → Patching service: {service_name}")
@@ -465,6 +518,38 @@ def patch_deployment_env(
         print(f" Services patched: {', '.join(patched_services)}")
         if use_configmap and enable:
             print(f" Library mounted at: {lib_path}")
+
+        # Force restart all worker pods when enabling to apply changes immediately
+        if enable:
+            print(
+                " → Force-deleting all worker pods to apply changes immediately..."
+            )
+            core_api = client.CoreV1Api()
+            try:
+                worker_pods = core_api.list_namespaced_pod(
+                    namespace=namespace,
+                    label_selector=f"nvidia.com/dynamo-graph-deployment-name={deployment_name},nvidia.com/dynamo-component-type=worker",
+                )
+                deleted_count = 0
+                for pod in worker_pods.items:
+                    try:
+                        core_api.delete_namespaced_pod(
+                            name=pod.metadata.name,
+                            namespace=namespace,
+                            grace_period_seconds=0,
+                        )
+                        deleted_count += 1
+                    except Exception as e:
+                        print(
+                            f" ⚠ Could not delete pod {pod.metadata.name}: {e}"
+                        )
+                print(
+                    f" ✓ Deleted {deleted_count} pod(s) - they will restart with CUDA library"
+                )
+            except Exception as e:
+                print(f" ⚠ Could not list/delete pods: {e}")
+                print("   Pods will eventually restart, but may take longer")
+
         return True
 
     except ApiException as e:
@@ -505,11 +590,15 @@ def patch_deployment_env(
 
         if enable:
             # Add new env vars
+            # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
+            fault_enabled_value = "0" if passthrough_mode else "1"
             container.env.append(
                 client.V1EnvVar(name="LD_PRELOAD", value="/tmp/cuda_intercept.so")
             )
             container.env.append(
-                client.V1EnvVar(name="CUDA_FAULT_INJECTION_ENABLED", value="1")
+                client.V1EnvVar(
+                    name="CUDA_FAULT_INJECTION_ENABLED", value=fault_enabled_value
+                )
             )
             container.env.append(
                 client.V1EnvVar(name="CUDA_XID_TYPE", value=str(xid_type))
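Taken together, the new `passthrough_mode` flag and the runtime toggle enable a patch-once, toggle-later workflow. The sketch below illustrates that flow; the leading parameters of `patch_deployment_env()` are not shown in this diff, so the keyword names (`deployment_name`, `namespace`, `enable`) and all resource names are assumptions for illustration only.

```python
# Hedged sketch of the intended workflow enabled by this commit.
from inject_into_pods import patch_deployment_env

# 1. Preload the library in passthrough mode: pods restart once with LD_PRELOAD
#    set but CUDA_FAULT_INJECTION_ENABLED=0, so a clean baseline can run first.
patch_deployment_env(
    deployment_name="my-dynamo-deployment",  # illustrative
    namespace="dynamo",                      # illustrative
    enable=True,
    target_node="gpu-node-1",                # illustrative
    xid_type=79,
    passthrough_mode=True,
)

# 2. When the fault should start, write "1" to /host-fault/cuda_fault_enabled on
#    the target node (e.g. by exec-ing into a worker pod). The library re-reads the
#    marker on every CUDA call, so no further pod restart is needed.
# 3. Writing "0" (or rescheduling the pod to another node, where the marker is
#    absent) recovers automatically.
```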
