
Commit 91a9d02

fs-eire and Copilot authored
[webgpu] add support of output_qk for MHA (#26553)
### Description

The WebGPU EP does not support MHA's `qk` output yet. This PR makes it handle the `qk` output correctly.

---------

Co-authored-by: Copilot <[email protected]>
1 parent 8ac5670 commit 91a9d02
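For context, the optional `qk` output exposes the attention scores before softmax: in the changes below, `probs` is copied out right after `ComputeAttentionProbs` and before the in-place softmax. Under the usual MHA convention (an assumption; the scale factor is not shown in this diff), the copied tensor is

$$\mathrm{output\_qk} = \frac{Q K^{\top}}{\sqrt{d_h}} \;\bigl(+\; \mathrm{attention\_bias}\ \text{when present}\bigr),$$

with shape `(batch_size, num_heads, sequence_length, total_sequence_length)` as set in multihead_attention.cc below.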

7 files changed: +45 −12 lines changed

onnxruntime/contrib_ops/webgpu/bert/attention.cc

Lines changed: 8 additions & 3 deletions
@@ -520,8 +520,8 @@ Status ComputeVxAttentionScore(onnxruntime::webgpu::ComputeContext& context, int
 
 Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias,
                       const Tensor* past_key, const Tensor* past_value, Tensor* output, Tensor* present_key, Tensor* present_value,
-                      WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* head_sink,
-                      const Tensor* seqlen_k, int local_window_size) {
+                      Tensor* output_qk, WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context,
+                      const Tensor* head_sink, const Tensor* seqlen_k, int local_window_size) {
   const int output_count = std::min({context.OutputCount(), 1 + (past_key != nullptr ? 1 : 0) + (past_value != nullptr ? 1 : 0)});
   const int past_sequence_length = output_count > 1 ? parameters.past_sequence_length_ : 0;
   const int total_sequence_length =

@@ -534,6 +534,11 @@ Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const T
   ORT_RETURN_IF_ERROR(ComputeAttentionProbs(context, output_count, Q, K, past_key, attention_bias, &probs, present_key,
                                             parameters, past_sequence_length, total_sequence_length, seqlen_k));
 
+  if (output_qk != nullptr) {
+    // Copy the attention scores (scaled Q*K^T) to output_qk
+    ORT_RETURN_IF_ERROR(context.CopyTensor(probs, *output_qk));
+  }
+
   ORT_RETURN_IF_ERROR(ComputeInPlaceSoftmax(context, &probs,
                                             parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_, parameters.use_smooth_softmax_, head_sink, local_window_size));
 

@@ -730,7 +735,7 @@ Status Attention::ComputeInternal(onnxruntime::webgpu::ComputeContext& context)
 
   // Apply the actual attention computation
   return ApplyAttention(&Q, &K, &V, attention_bias, nullptr, nullptr, output, /* present_key */ nullptr,
-                        /* present_value */ nullptr, parameters, context, nullptr, nullptr, -1);
+                        /* present_value */ nullptr, /* output_qk */ nullptr, parameters, context, nullptr, nullptr, -1);
 }
 
 }  // namespace webgpu
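Since the copy above runs between `ComputeAttentionProbs` and `ComputeInPlaceSoftmax`, `output_qk` receives the raw scaled scores, and the same `probs` buffer is then overwritten with probabilities. A standalone CPU sketch of that ordering (illustration only, not ORT code; the 1/sqrt(head_size) scale is an assumption):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int seq = 2, total_seq = 2, head = 2;
  std::vector<float> Q = {1, 0, 0, 1};  // [seq, head]
  std::vector<float> K = {1, 0, 0, 1};  // [total_seq, head]
  const float scale = 1.0f / std::sqrt(static_cast<float>(head));

  // probs = scale * Q * K^T, as after ComputeAttentionProbs
  std::vector<float> probs(seq * total_seq);
  for (int i = 0; i < seq; ++i)
    for (int j = 0; j < total_seq; ++j) {
      float dot = 0;
      for (int k = 0; k < head; ++k) dot += Q[i * head + k] * K[j * head + k];
      probs[i * total_seq + j] = scale * dot;
    }

  std::vector<float> output_qk = probs;  // snapshot: pre-softmax scores

  for (int i = 0; i < seq; ++i) {  // in-place softmax per row, as in ComputeInPlaceSoftmax
    float sum = 0;
    for (int j = 0; j < total_seq; ++j) sum += std::exp(probs[i * total_seq + j]);
    for (int j = 0; j < total_seq; ++j) probs[i * total_seq + j] = std::exp(probs[i * total_seq + j]) / sum;
  }

  // output_qk kept the raw score; probs now holds a probability.
  std::printf("qk[0][0]=%.4f prob[0][0]=%.4f\n", output_qk[0], probs[0]);
  return 0;
}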

onnxruntime/contrib_ops/webgpu/bert/attention_common.h

Lines changed: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ Status TransferBSDToBNSH(onnxruntime::webgpu::ComputeContext& context, int num_h
 
 Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias,
                       const Tensor* past_key, const Tensor* past_value, Tensor* output, Tensor* present_key, Tensor* present_value,
-                      WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context,
+                      Tensor* output_qk, WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context,
                       const Tensor* head_sink = nullptr, const Tensor* seqlen_k = nullptr, int local_window_size = -1);
 
 }  // namespace webgpu

onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc

Lines changed: 2 additions & 2 deletions
@@ -321,7 +321,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
       context, parameters.num_heads_, parameters.sequence_length_, parameters.head_size_, query, nullptr, 0, &Q));
   if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) {  // key and value in BNSH format
     return ApplyAttention(&Q, key, value, attention_bias, past_key, past_value, output, present_key,
-                          present_value, parameters, context, head_sink, seqlen_k, local_window_size_);
+                          present_value, nullptr, parameters, context, head_sink, seqlen_k, local_window_size_);
   }
 
   TensorShapeVector k_new_dims({parameters.batch_size_, parameters.kv_num_heads_,

@@ -338,7 +338,7 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
   ORT_RETURN_IF_ERROR(TransferBSDToBNSH(context, parameters.kv_num_heads_, parameters.kv_sequence_length_,
                                         parameters.v_head_size_, value, nullptr, 0, &V));
   return ApplyAttention(&Q, &K, &V, attention_bias, past_key, past_value, output, present_key,
-                        present_value, parameters, context, head_sink, seqlen_k, local_window_size_);
+                        present_value, nullptr, parameters, context, head_sink, seqlen_k, local_window_size_);
 }
 
 KernelCreateInfo CreateGroupQueryAttentionKernelInfo(bool enable_graph_capture) {

onnxruntime/contrib_ops/webgpu/bert/multihead_attention.cc

Lines changed: 13 additions & 3 deletions
@@ -94,7 +94,17 @@ Status MultiHeadAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
   Tensor* present_key = context.Output(1, present_shape);
   Tensor* present_value = context.Output(2, present_shape);
 
-  if (CanApplyFlashAttention(bias, present_key, present_value, parameters, context)) {
+  std::vector<int64_t> output_qk_dims{
+      parameters.batch_size_,
+      parameters.num_heads_,
+      parameters.sequence_length_,
+      parameters.total_sequence_length_,
+  };
+  TensorShape output_qk_shape(output_qk_dims);
+  Tensor* output_qk = context.Output(3, output_qk_shape);
+
+  if (output_qk == nullptr &&  // Flash attention does not output QK scores
+      CanApplyFlashAttention(bias, present_key, present_value, parameters, context)) {
     return ApplyFlashAttention(query, key, value, attention_bias, output, past_key, present_key, past_value,
                                present_value, parameters, context);
   }

@@ -108,7 +118,7 @@ Status MultiHeadAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
 
   if (parameters.qkv_format_ == Q_K_V_BSNH_BNSH_BNSH) {  // key and value in BNSH format
     return ApplyAttention(&Q, key, value, attention_bias, past_key, past_value, output, present_key,
-                          present_value, parameters, context);
+                          present_value, output_qk, parameters, context);
   }
 
   TensorShapeVector k_new_dims({parameters.batch_size_, parameters.num_heads_,

@@ -127,7 +137,7 @@ Status MultiHeadAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
 
   // Compute the attention score and apply the score to V
   return ApplyAttention(&Q, &K, &V, attention_bias, past_key, past_value, output, present_key,
-                        present_value, parameters, context);
+                        present_value, output_qk, parameters, context);
 }
 
 }  // namespace webgpu
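The `output_qk == nullptr` guard exists because flash-style kernels fuse softmax into the tile loop (online softmax) and never materialize the full `sequence_length × total_sequence_length` score matrix that `qk` requires; requesting `qk` therefore falls through to the regular path. A minimal CPU sketch of online softmax over one attention row (illustration only, not the EP's shader):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // One attention row; in a fused kernel these scores are produced and
  // consumed tile by tile, never stored as a full matrix.
  std::vector<float> scores = {0.5f, 2.0f, -1.0f, 0.25f};

  // Online softmax keeps only a running max and a running denominator.
  float running_max = -INFINITY, denom = 0.0f;
  for (float s : scores) {
    float new_max = std::max(running_max, s);
    denom = denom * std::exp(running_max - new_max) + std::exp(s - new_max);
    running_max = new_max;
  }

  // Probabilities are recoverable per element, but the pre-softmax row was
  // never retained -- which is exactly what a qk snapshot would need.
  for (float s : scores) std::printf("%.4f ", std::exp(s - running_max) / denom);
  std::printf("\n");
  return 0;
}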

onnxruntime/core/providers/webgpu/compute_context.cc

Lines changed: 5 additions & 1 deletion
@@ -6,9 +6,13 @@
 
 namespace onnxruntime {
 namespace webgpu {
-ComputeContext::ComputeContext(OpKernelContext& kernel_context, const WebGpuExecutionProvider& ep, WebGpuContext& webgpu_context)
+ComputeContext::ComputeContext(OpKernelContext& kernel_context,
+                               const OpKernel& op_kernel,
+                               const WebGpuExecutionProvider& ep,
+                               WebGpuContext& webgpu_context)
     : webgpu_context_{webgpu_context},
       kernel_context_{kernel_context},
+      op_kernel_{op_kernel},
       ep_{ep} {
 }
 

onnxruntime/core/providers/webgpu/compute_context.h

Lines changed: 15 additions & 1 deletion
@@ -7,6 +7,7 @@
 
 #include <utility>
 
+#include "core/framework/data_transfer_manager.h"
 #include "core/framework/execution_provider.h"
 #include "core/providers/webgpu/webgpu_execution_provider.h"
 

@@ -36,7 +37,10 @@ class ComputeContext final {
     static const webgpu::BufferManager& Get(const ComputeContext& context);
   };
 
-  ComputeContext(OpKernelContext& kernel_context, const WebGpuExecutionProvider& ep, WebGpuContext& webgpu_context);
+  ComputeContext(OpKernelContext& kernel_context,
+                 const OpKernel& op_kernel,
+                 const WebGpuExecutionProvider& ep,
+                 WebGpuContext& webgpu_context);
 
   ~ComputeContext() = default;
 

@@ -132,6 +136,15 @@ class ComputeContext final {
     return {data_type, std::forward<TensorShapeType>(shape), allocator};
   }
 
+  //
+  // Copy data from a tensor to another tensor.
+  //
+  // This method assumes that both tensors have the same data size.
+  //
+  inline Status CopyTensor(const Tensor& src, Tensor& dst) {
+    return op_kernel_.Info().GetDataTransferManager().CopyTensor(src, dst);
+  }
+
   //
   // Run a compute shader program.
   //

@@ -142,6 +155,7 @@ class ComputeContext final {
  private:
  WebGpuContext& webgpu_context_;
  OpKernelContext& kernel_context_;
+  const OpKernel& op_kernel_;
   const WebGpuExecutionProvider& ep_;
 };
 
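A hedged usage sketch of the new helper from kernel code. `CopyTensor` and `Output` come from this diff; the kernel, the `Input` accessor, and the tensor roles are assumptions for illustration:

// Hypothetical WebGPU kernel body (sketch):
Status MyKernel::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
  const Tensor* src = context.Input<Tensor>(0);    // assumed accessor
  Tensor* dst = context.Output(0, src->Shape());   // same shape, so same data size
  if (dst != nullptr) {
    // Routed through the kernel's DataTransferManager, so GPU-resident
    // tensors are handled; both tensors must have the same data size.
    ORT_RETURN_IF_ERROR(context.CopyTensor(*src, *dst));
  }
  return Status::OK();
}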

onnxruntime/core/providers/webgpu/webgpu_kernel.cc

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ WebGpuKernel::WebGpuKernel(const OpKernelInfo& info)
 
 Status WebGpuKernel::Compute(OpKernelContext* p_op_kernel_context) const {
   WebGpuContext& webgpu_context = WebGpuContextFactory::GetContext(ep_.GetDeviceId());
-  ComputeContext context{*p_op_kernel_context, ep_, webgpu_context};
+  ComputeContext context{*p_op_kernel_context, *this, ep_, webgpu_context};
 
   if (webgpu_context.ValidationMode() >= ValidationMode::Full) {
     webgpu_context.PushErrorScope();
