[JAX] MXFP8 Grouped Quant+GEMM #2763
Status: Open. jberchtold-nvidia wants to merge 73 commits into NVIDIA:main from jberchtold-nvidia:jberchtold/gmm-mxfp8.
Commits (73):
28e5f53 Refactor to group_sizes per tensor (jberchtold-nvidia)
4a57485 Support first_dims and last_dims instead of a single group_sizes per (jberchtold-nvidia)
345d940 Refactor GMM FFIs to store static attrs as structs (jberchtold-nvidia)
ed9c8e4 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
ed0deaf Cleanup C++ v2 FFI (jberchtold-nvidia)
88bb7da Fix int64 workspace usage (jberchtold-nvidia)
60312c8 Address greptile comments (jberchtold-nvidia)
025f598 Refactor wgrad-specific checks to be generic for GMM in gemm.py (jberchtold-nvidia)
089e530 Refactor XLA FFI struct setup (jberchtold-nvidia)
8ad2294 Fix edge case in TE v1 GMM (jberchtold-nvidia)
bac092d Merge remote-tracking branch 'github-upstream/main' into jberchtold/g… (jberchtold-nvidia)
4ff5d1d [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
0cb7289 Fix issues on Hopper (jberchtold-nvidia)
37d300a Merge remote-trackint commit --amend -sg branch 'github-upstream/main… (jberchtold-nvidia)
cc236ad Refactor (jberchtold-nvidia)
1d1fec9 MXFP8 grouped quantize V2 (jberchtold-nvidia)
269a518 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
2b84dfd [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
b2b3216 MXFP8 quantization working (jberchtold-nvidia)
47218b3 Merge remote-tracking branch 'github-upstream/main' into jberchtold/g… (jberchtold-nvidia)
611526f mxfp8 grouped gemm (jberchtold-nvidia)
c97b0b7 te_permutation NaN issue fix (jberchtold-nvidia)
0b9a763 Support GroupedDense quantization checkpointing (jberchtold-nvidia)
6b64cea Temporary commit to assert if V1 grouped quantize is used (jberchtold-nvidia)
2dd69d4 Fix scale shapes for MXFP8 (jberchtold-nvidia)
204b326 Fix MXFP8 scale sharding when FSDP+EP on same axis (jberchtold-nvidia)
5fb585f [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
2902eb2 Merge remote-tracking branch 'github-upstream/main' into jberchtold/g… (jberchtold-nvidia)
bee7f3b Address comments (jberchtold-nvidia)
d9b9c44 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
ef0d498 Merge branch 'main' into jberchtold/gmm-refactor (jberchtold-nvidia)
9438478 Lint (jberchtold-nvidia)
09dfd9c Fixes for Hopper (jberchtold-nvidia)
e25538e Address review comments (jberchtold-nvidia)
78674e9 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
d5229e2 Merge branch 'main' into jberchtold/gmm-refactor (jberchtold-nvidia)
b78435a Merge jberchtold/gmm-refactor into jberchtold/gmm-mxfp8 (jberchtold-nvidia)
06ebb44 Fixes (jberchtold-nvidia)
a3f8042 wip (jberchtold-nvidia)
7e99314 Fix grouped colwise dequantize for transposed ragged tensors and V1 p… (jberchtold-nvidia)
68bcbfc 2D shape fixes for flattened 1D shape from grouped quantization (jberchtold-nvidia)
81cb189 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
75995e4 Merge remote-tracking branch 'github-upstream/main' into jberchtold/g… (jberchtold-nvidia)
d7b04cc Fix swizzling (jberchtold-nvidia)
064f314 Remove pre-swizzling from non-grouped quantization (jberchtold-nvidia)
5edef90 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
c55969c Use avg m,n,k heuristics for cuBLASLt Grouped GEMM (jberchtold-nvidia)
427d5b6 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
167c343 Update transformer_engine/jax/cpp_extensions/gemm.py (jberchtold-nvidia)
ae97af1 Use avg m,n,k heuristics for cuBLASLt Grouped GEMM (jberchtold-nvidia)
f1c7582 Fix rhs transpose flag (jberchtold-nvidia)
b3ea76a Fix rhs transpose flag (jberchtold-nvidia)
6387b8a Address comments (jberchtold-nvidia)
7febb9b [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
7e94996 Merge branch 'main' into jberchtold/gmm-avg-mnk (jberchtold-nvidia)
fbebfea Merge branch 'jberchtold/gmm-avg-mnk' into jberchtold/gmm-mxfp8 (jberchtold-nvidia)
2e1a9f5 Fix merge issue (jberchtold-nvidia)
7769c51 Remove unnecessary changes (jberchtold-nvidia)
6fbe4ca Cleanup tests (jberchtold-nvidia)
7cafd35 Fix tests (jberchtold-nvidia)
49e7a60 Use GroupedTensorWrapper in grouped quantization (jberchtold-nvidia)
644520b [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
087bd2e Merge remote-tracking branch 'github-upstream/main' into jberchtold/g… (jberchtold-nvidia)
56fce55 Fix merge conflict issue (jberchtold-nvidia)
9ea2482 Address comments (jberchtold-nvidia)
2af15e5 Clean up grouped_gemm function (jberchtold-nvidia)
6535819 Test fixes (jberchtold-nvidia)
bf6377b Fix old var names in V1 python codepath (jberchtold-nvidia)
16a4bf7 Fix lint (jberchtold-nvidia)
513108a Merge remote-tracking branch 'github-upstream/main' into jberchtold/g… (jberchtold-nvidia)
9ced1c5 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
4da6b80 Fix Hopper V1 FP8 grouped GEMMs (jberchtold-nvidia)
5cf90d5 Merge branch 'main' into jberchtold/gmm-mxfp8 (jberchtold-nvidia)
Review comment: I have an idea for this in case n_groups ever gets large: have 32 threads compute cumsums over blocks of the input, then use warp shuffles (shfl) to reduce the per-block local sums into a single sum.
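The two-level scan proposed above can be illustrated on the host. This is a minimal NumPy sketch, not the actual CUDA kernel: each 32-entry chunk stands in for one warp computing a local exclusive cumsum, and the pass over chunk totals stands in for the warp-shuffle step that combines the local sums. The function name `two_level_cumsum` is hypothetical.

```python
import numpy as np

WARP_SIZE = 32  # entries handled per "warp" in this simulation

def two_level_cumsum(group_sizes: np.ndarray) -> np.ndarray:
    """Exclusive cumsum of group sizes via a two-level scan.

    Level 1: each chunk of WARP_SIZE entries gets a local exclusive
    cumsum (what one warp would compute cooperatively).
    Level 2: the per-chunk totals are scanned and added back as
    offsets (the role warp shuffles would play on the GPU).
    """
    n = len(group_sizes)
    out = np.zeros(n, dtype=np.int64)
    chunk_totals = []
    for start in range(0, n, WARP_SIZE):
        chunk = group_sizes[start:start + WARP_SIZE]
        # np.cumsum(chunk) - chunk is the exclusive prefix sum of the chunk
        out[start:start + len(chunk)] = np.cumsum(chunk) - chunk
        chunk_totals.append(int(chunk.sum()))
    offset = 0
    for i, start in enumerate(range(0, n, WARP_SIZE)):
        out[start:start + WARP_SIZE] += offset  # shift chunk by preceding totals
        offset += chunk_totals[i]
    return out
```

The exclusive cumsum of `group_sizes` gives each group's starting row offset into the ragged tensor, which is why a fast scan matters once n_groups grows past a single warp.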
Reply: Sounds good! Currently the kernel runtime is pretty small relative to our other kernels, and our n_groups per device is fairly small with EP, but that's a good idea for the future if n_groups per device gets bigger.