
Commit b3ba580

[webgpu] Use DoTranspose directly in Conv Op (#26074)

### Description

This refactors the `TransposeKernel` to call `Transpose::DoTranspose` directly.

### Motivation and Context

See above.
1 parent cd3aa49 · commit b3ba580
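In essence (quoting the two added lines from the conv.cc diff below, with explanatory comments added for this writeup; the comments are not part of the commit):

```cpp
// TransposeKernel no longer builds its own TransposeProgram or computes
// dispatch sizes. It wraps the kernel's existing GPU buffer in a non-owning
// Tensor carrying kernel_shape (which may differ from kernel->Shape()), so no
// data is copied, and delegates to the shared transpose implementation.
const Tensor reshaped_kernel(kernel->DataType(), kernel_shape, const_cast<void*>(kernel->DataRaw()), kernel->Location());
return Transpose::DoTranspose(context, perm, reshaped_kernel, *transposed_kernel);
```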

File tree

2 files changed: +35 −45 lines


onnxruntime/core/providers/webgpu/nn/conv.cc

Lines changed: 2 additions & 37 deletions
@@ -9,14 +9,6 @@
 #include "core/providers/webgpu/webgpu_utils.h"
 #include "core/providers/webgpu/math/matmul.h"
 
-namespace {
-
-inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
-  return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
-}
-
-}  // namespace
-
 namespace onnxruntime {
 namespace webgpu {
 
@@ -27,37 +19,10 @@ Status TransposeKernel(ComputeContext& context, const Tensor* kernel, const Tens
   for (size_t i = 0; i < rank; ++i) {
     transposed_kernel_shape_vector[i] = kernel_shape[perm[i]];
   }
-  uint32_t output_size = onnxruntime::narrow<uint32_t>(kernel_shape.Size());
-
-  uint32_t dispatch_x = ceil_div(output_size, 64);
-  uint32_t dispatch_y = 1;
-  uint32_t dispatch_z = 1;
-
-  // This temporary workaround addresses a significant performance bottleneck
-  // (~12x slower) for the shape (3, 3, 2560, 1280) due to an issue with Intel's
-  // GPU drivers. We manually normalize the dispatch group size to restore
-  // performance.
-  //
-  // TODO: Revert this change once the driver issue is fixed.
-  if (context.AdapterInfo().vendor == std::string_view{"intel"}) {
-    ORT_ENFORCE(rank == static_cast<size_t>(4), "Input tensor must have rank 4.");
-    dispatch_x = ceil_div(transposed_kernel_shape_vector[0] * transposed_kernel_shape_vector[1], 2);
-    dispatch_y = ceil_div(transposed_kernel_shape_vector[2], 4);
-    dispatch_z = ceil_div(transposed_kernel_shape_vector[3], 8);
-  }
-
   TensorShape transposed_kernel_shape(transposed_kernel_shape_vector);
   *transposed_kernel = context.CreateGPUTensor(kernel->DataType(), transposed_kernel_shape);
-  bool use_shared = false;
-  TransposeProgram program{perm, use_shared};
-  program
-      .CacheHint(absl::StrJoin(perm, "-"))
-      .AddInput({kernel, ProgramTensorMetadataDependency::TypeAndRank, kernel_shape, 1})
-      .AddOutput({transposed_kernel, ProgramTensorMetadataDependency::TypeAndRank})
-      .AddUniformVariable({output_size})
-      .SetWorkgroupSize(64)
-      .SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
-  return context.RunProgram(program);
+  const Tensor reshaped_kernel(kernel->DataType(), kernel_shape, const_cast<void*>(kernel->DataRaw()), kernel->Location());
+  return Transpose::DoTranspose(context, perm, reshaped_kernel, *transposed_kernel);
 }
 
 template <bool is_channels_last, bool is_fused>
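The pattern here is to reinterpret an existing buffer under a new shape (no copy) and route every transpose through one shared routine. A minimal CPU-side analogue of that pattern, not taken from ONNX Runtime (all names are illustrative):

```cpp
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Shared transpose routine (stand-in for Transpose::DoTranspose): permutes a
// rank-4 row-major tensor according to perm.
void do_transpose(const float* src, const std::array<int64_t, 4>& shape,
                  const std::array<size_t, 4>& perm, float* dst) {
  std::array<int64_t, 4> out_shape;
  for (size_t i = 0; i < 4; ++i) out_shape[i] = shape[perm[i]];
  std::array<int64_t, 4> in{};
  for (in[0] = 0; in[0] < shape[0]; ++in[0])
    for (in[1] = 0; in[1] < shape[1]; ++in[1])
      for (in[2] = 0; in[2] < shape[2]; ++in[2])
        for (in[3] = 0; in[3] < shape[3]; ++in[3]) {
          // Row-major offsets in the input and in the permuted output.
          int64_t src_off = ((in[0] * shape[1] + in[1]) * shape[2] + in[2]) * shape[3] + in[3];
          int64_t dst_off = ((in[perm[0]] * out_shape[1] + in[perm[1]]) * out_shape[2]
                             + in[perm[2]]) * out_shape[3] + in[perm[3]];
          dst[dst_off] = src[src_off];
        }
}

int main() {
  // The caller owns the buffer; we only reinterpret its shape before
  // delegating (analogue of the non-owning Tensor built from DataRaw()).
  std::array<int64_t, 4> kernel_shape{3, 3, 4, 2};  // e.g. HWIO
  std::vector<float> kernel(3 * 3 * 4 * 2);
  for (size_t i = 0; i < kernel.size(); ++i) kernel[i] = static_cast<float>(i);

  std::array<size_t, 4> perm{2, 3, 0, 1};  // HWIO -> IOHW
  std::vector<float> out(kernel.size());
  do_transpose(kernel.data(), kernel_shape, perm, out.data());
  std::printf("out[0] = %g\n", out[0]);  // output (0,0,0,0) reads input (0,0,0,0), i.e. 0
  return 0;
}
```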

onnxruntime/core/providers/webgpu/tensor/transpose.cc

Lines changed: 33 additions & 8 deletions
@@ -8,6 +8,14 @@
 #include "core/providers/webgpu/shader_helper.h"
 #include "core/providers/webgpu/webgpu_supported_types.h"
 
+namespace {
+
+inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
+  return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
+}
+
+}  // namespace
+
 namespace onnxruntime {
 namespace webgpu {
 
@@ -134,22 +142,39 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context,
   uint32_t output_size = onnxruntime::narrow<int32_t>(input_shape.Size());
   TransposeProgram program{permutations, use_shared};
 
-  if (use_shared) {
-    program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
-  }
   program
       .CacheHint(absl::StrJoin(permutations, "-"))
       .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
       .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
-      .SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
-                            static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
       .AddUniformVariables({
          {static_cast<uint32_t>(output_size)},
      });
 
-  use_shared ? program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
-                                            static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
-             : program.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE);
+  if (use_shared) {
+    program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
+    program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
+                                 static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)));
+  } else {
+    program.SetWorkgroupSize(WORKGROUP_SIZE);
+
+    uint32_t dispatch_x = ceil_div(output_size, WORKGROUP_SIZE);
+    uint32_t dispatch_y = 1;
+    uint32_t dispatch_z = 1;
+
+    // This temporary workaround addresses a significant performance bottleneck
+    // (~12x slower) for the shape (3, 3, 2560, 1280) due to an issue with Intel's
+    // GPU drivers. We manually normalize the dispatch group size to restore
+    // performance.
+    //
+    // TODO: Revert this change once the driver issue is fixed.
+    if (context.AdapterInfo().vendor == std::string_view{"intel"}) {
+      ORT_ENFORCE(rank == static_cast<size_t>(4), "Input tensor must have rank 4.");
+      dispatch_x = ceil_div(input_shape[0] * input_shape[1], 2);
+      dispatch_y = ceil_div(input_shape[2], 4);
+      dispatch_z = ceil_div(input_shape[3], 8);
+    }
+    program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
+  }
   return context.RunProgram(program);
 }
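To make the workaround concrete, here is the dispatch arithmetic for the shape called out in the comment, as a standalone check. WORKGROUP_SIZE is assumed to be 64 here, matching the literal the old conv.cc code used; its actual value comes from the WebGPU provider headers.

```cpp
#include <cstdint>
#include <cstdio>

// Same helper the commit moves into transpose.cc.
inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
  return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
}

int main() {
  // The shape named in the workaround comment: (3, 3, 2560, 1280).
  const int64_t shape[4] = {3, 3, 2560, 1280};
  const int64_t output_size = shape[0] * shape[1] * shape[2] * shape[3];  // 29,491,200

  // Default (non-shared) path: one linear dispatch dimension.
  std::printf("default:    x=%u y=1 z=1\n", ceil_div(output_size, 64));  // x=460800

  // Intel workaround: spread the same work over three dimensions.
  std::printf("workaround: x=%u y=%u z=%u\n",
              ceil_div(shape[0] * shape[1], 2),  // ceil(9/2) = 5
              ceil_div(shape[2], 4),             // 2560/4    = 640
              ceil_div(shape[3], 8));            // 1280/8    = 160
  return 0;
}
```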

0 commit comments
