 #include "core/providers/webgpu/shader_helper.h"
 #include "core/providers/webgpu/webgpu_supported_types.h"

+namespace {
+
+// Integer ceiling division: smallest integer not less than numerator / denominator.
+inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
+  return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
+}
+
+}  // namespace
+
 namespace onnxruntime {
 namespace webgpu {

@@ -134,22 +142,39 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContext& context, |
   uint32_t output_size = onnxruntime::narrow<int32_t>(input_shape.Size());
   TransposeProgram program{permutations, use_shared};

-  if (use_shared) {
-    program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
-  }
   program
       .CacheHint(absl::StrJoin(permutations, "-"))
       .AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
       .AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
-      .SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
-                            static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
       .AddUniformVariables({
           {static_cast<uint32_t>(output_size)},
       });

-  use_shared ? program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
-                                            static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)))
-             : program.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE);
+  if (use_shared) {
+    program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
+    program.SetDispatchGroupSize(ceil_div(new_output_shape[1], TILE_SIZE),
+                                 ceil_div(new_output_shape[0], TILE_SIZE));
+  } else {
+    program.SetWorkgroupSize(WORKGROUP_SIZE);
+
+    uint32_t dispatch_x = ceil_div(output_size, WORKGROUP_SIZE);
+    uint32_t dispatch_y = 1;
+    uint32_t dispatch_z = 1;
+
+    // This temporary workaround addresses a significant performance bottleneck
+    // (~12x slowdown) observed for the shape (3, 3, 2560, 1280) due to an issue
+    // with Intel's GPU drivers. We manually normalize the dispatch group size
+    // across all three dimensions to restore performance. It is applied only to
+    // rank-4 inputs on Intel GPUs; other ranks keep the default 1-D dispatch.
+    //
+    // TODO: Revert this change once the driver issue is fixed.
+    if (context.AdapterInfo().vendor == std::string_view{"intel"} && rank == 4) {
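+      // Note: 2 * 4 * 8 == 64, the (assumed) default WORKGROUP_SIZE, so the
+      // normalized 3-D dispatch below covers at least as many invocations as
+      // the 1-D dispatch above; out-of-range threads are presumably discarded
+      // by the shader's bounds check against the output_size uniform.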
+      dispatch_x = ceil_div(input_shape[0] * input_shape[1], 2);
+      dispatch_y = ceil_div(input_shape[2], 4);
+      dispatch_z = ceil_div(input_shape[3], 8);
+    }
+    program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
+  }
   return context.RunProgram(program);
 }

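As a quick sanity check of the workaround's arithmetic, the standalone sketch below (not part of the patch) compares the flat 1-D dispatch against the normalized 3-D dispatch for the shape called out in the comment. It assumes WORKGROUP_SIZE is 64 (mirroring the provider's default) and reuses the ceil_div helper; since 2 * 4 * 8 == 64, the factored dispatch launches at least output_size invocations.

// Standalone sketch: flat vs. normalized dispatch for shape (3, 3, 2560, 1280).
// WORKGROUP_SIZE = 64 is an assumption mirroring the provider's default.
#include <cstdint>
#include <cstdio>

constexpr int32_t WORKGROUP_SIZE = 64;

inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
  return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
}

int main() {
  const int64_t shape[4] = {3, 3, 2560, 1280};
  const int64_t output_size = shape[0] * shape[1] * shape[2] * shape[3];

  // Default path: a single flat dimension of workgroups.
  const uint32_t flat_x = ceil_div(output_size, WORKGROUP_SIZE);

  // Workaround path: the same work spread across x/y/z (2 * 4 * 8 == 64).
  const uint32_t x = ceil_div(shape[0] * shape[1], 2);
  const uint32_t y = ceil_div(shape[2], 4);
  const uint32_t z = ceil_div(shape[3], 8);

  const uint64_t flat_invocations = static_cast<uint64_t>(flat_x) * WORKGROUP_SIZE;
  const uint64_t norm_invocations = static_cast<uint64_t>(x) * y * z * WORKGROUP_SIZE;

  std::printf("1-D: %u groups -> %llu invocations\n", flat_x,
              static_cast<unsigned long long>(flat_invocations));
  std::printf("3-D: %u x %u x %u groups -> %llu invocations\n", x, y, z,
              static_cast<unsigned long long>(norm_invocations));
  // Both cover output_size = 29,491,200 elements; the 3-D variant slightly
  // over-dispatches (32,768,000), relying on the shader-side bounds check.
  return norm_invocations >= static_cast<uint64_t>(output_size) ? 0 : 1;
}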