NVIDIA · javierdejesusda · Mar 23, 2026
@@ -988,6 +988,8 @@ def forward(
                     batch_size, seq_len_s, device=eagle_input_hiddens.device
                 ).argsort(dim=1)[:, :num_to_replace]
 
+                # Clone to avoid inplace modification that breaks autograd
+                eagle_input_hiddens = eagle_input_hiddens.clone()
                 batch_indices = torch.arange(batch_size)[:, None]
                 eagle_input_hiddens[batch_indices, rand_indices] = eagle_output_hiddens[
                     batch_indices, rand_indices

diff --git a/tests/unit/torch/speculative/plugins/test_hf_speculative.py b/tests/unit/torch/speculative/plugins/test_hf_speculative.py
@@ -17,6 +17,7 @@
 from copy import deepcopy
 
 import pytest
+import torch
 from _test_utils.torch.transformers_models import (
     get_tiny_llama,
     tf_modelopt_state_and_output_tester,
@@ -48,3 +49,39 @@ def test_eagle_model_convert_save_and_restore(tmp_path, eagle_config):
     model_test = AutoModelForCausalLM.from_pretrained(tmp_path / "modelopt_model")
     assert isinstance(model_test, mtsp.plugins.HFEagleModel)
     tf_modelopt_state_and_output_tester(model_ref, model_test)
+
+
+@pytest.mark.parametrize("eagle_config", [EAGLE3_DEFAULT_CFG])
+@pytest.mark.parametrize("eagle_ttt_steps", [1, 2])
+def test_eagle_mix_hidden_states_backward(eagle_config, eagle_ttt_steps):
+    """Regression test for GitHub issue #1088.
+
+    Runs one forward+backward pass with ``eagle_mix_hidden_states=True`` for each
+    parametrized ``eagle_ttt_steps``; it must not raise (an in-place tensor write
+    used to break autograd) and must leave a gradient on an ``eagle_module`` param.
+    """
+    model = get_tiny_llama(num_hidden_layers=8)
+
+    config = deepcopy(eagle_config["config"])  # deep copy: nested dicts are mutated below
+    config["eagle_architecture_config"].update(
+        {
+            "draft_vocab_size": model.config.vocab_size,
+            "hidden_size": model.config.hidden_size,
+        }
+    )
+    config["eagle_mix_hidden_states"] = True  # enable the code path under test
+    config["eagle_ttt_steps"] = eagle_ttt_steps
+    config["eagle_use_torch_compile"] = False
+
+    mtsp.convert(model, mode=[("eagle", config)])
+    model.train()
+
+    input_ids = torch.randint(0, model.config.vocab_size, (2, 16))
+    labels = input_ids.clone()
+
+    outputs = model(input_ids=input_ids, labels=labels)
+    assert outputs.loss is not None
+    outputs.loss.backward()  # must not raise
+
+    eagle_grads = [p.grad for p in model.eagle_module.parameters() if p.grad is not None]
+    assert len(eagle_grads) > 0, "Expected gradients to flow to eagle_module"