12 changes: 12 additions & 0 deletions swift/megatron/init.py
@@ -9,6 +9,12 @@
from typing import List, Optional, Tuple

import peft
try:
    # Enable Megatron on Ascend NPU
    import mindspeed.megatron_adaptor  # noqa: F401
    HAS_MINDSPEED = True
except ImportError:
    HAS_MINDSPEED = False
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -710,6 +716,12 @@ def _patch_megatron():


def init_megatron_env() -> None:
    if HAS_MINDSPEED:
        from mindspeed.megatron_adaptor import repatch

        # Workaround for transformer_engine not being found on Ascend NPU
        repatch({})

    if 'MEGATRON_LM_PATH' not in os.environ:
        # TODO: Synchronization issues may occur in DDP scenarios
        # if the distributed environment has not been initialized.
14 changes: 14 additions & 0 deletions swift/megatron/train/sft.py
@@ -1,10 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from dataclasses import asdict
from functools import partial
from typing import List, Optional, Union

import torch
import torch.distributed as dist
try:
    # Enable Megatron on Ascend NPU
    from mindspeed.megatron_adaptor import repatch
except ImportError:
    repatch = None

from swift.llm import TEMPLATE_MAPPING
from swift.llm.train import SwiftSft
@@ -28,6 +34,14 @@ def __init__(self, args: Optional[Union[List[str], MegatronTrainArguments]] = None
        self.train_msg = {}
        super(SwiftSft, self).__init__(args)
        args = self.args
        if repatch is not None:
            megatron_args = asdict(args)
            if megatron_args["attention_backend"] != "local":
                megatron_args["use_flash_attn"] = True
                # MindSpeed requires passing `use_flash_attn` to Megatron
                # to enable flash attention on Ascend NPU.
                self.args.use_flash_attn = True
            repatch(megatron_args)
Comment on lines +38 to +44 (Contributor, severity: medium):
The logic for setting use_flash_attn can be simplified for better clarity and to avoid redundancy. It's clearer to modify args.use_flash_attn directly and then create the megatron_args dictionary from the updated args object. This avoids modifying both the dictionary and the args object separately for the same value.

Suggested change
-            megatron_args = asdict(args)
-            if megatron_args["attention_backend"] != "local":
-                megatron_args["use_flash_attn"] = True
-                # MindSpeed requires passing `use_flash_attn` to Megatron
-                # to enable flash attention on Ascend NPU.
-                self.args.use_flash_attn = True
-            repatch(megatron_args)
+            if args.attention_backend != "local":
+                # MindSpeed requires passing `use_flash_attn` to Megatron
+                # to enable flash attention on Ascend NPU.
+                args.use_flash_attn = True
+            megatron_args = asdict(args)
+            repatch(megatron_args)

        template_cls = TEMPLATE_MAPPING[args.template].template_cls
        if args.model_meta.is_multimodal and template_cls and template_cls.use_model:
            kwargs = {'return_dummy_model': True}
45 changes: 45 additions & 0 deletions vl-moe.sh
@@ -0,0 +1,45 @@
PYTORCH_NPU_ALLOC_CONF='expandable_segments:True' \
Collaborator comment:
It is recommended to place the examples in the examples/npu/megatron directory.

In the future, the npu directory can provide more NPU-related scripts.

It is recommended to add a section on NPU environment installation in the Megatron-SWIFT Quick Start documentation; a rough sketch of such a section follows at the end of this file's review.

NPROC_PER_NODE=4 \
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--load_safetensors true \
--save_safetensors true \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--train_type full \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--decoder_first_pipeline_num_layers 5 \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 1 \
--global_batch_size 4 \
--finetune true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--max_epochs 1 \
--save megatron_output/Qwen3-VL-30B-A3B \
--eval_interval 200 \
--save_interval 200 \
--vit_gradient_checkpointing true \
--max_length 2048 \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--packing true \
--cross_entropy_loss_fusion true \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--attention_backend flash
# --moe_permute_fusion true \
Contributor comment (severity: medium):
This commented-out line includes an unnecessary trailing backslash. Additionally, the file is missing a newline character at the end. According to POSIX standards, a text file should conclude with a newline to ensure compatibility with various command-line tools.

Suggested change
-# --moe_permute_fusion true \
+# --moe_permute_fusion true

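Responding to the collaborator suggestion above about documenting NPU environment installation, here is a rough sketch of what such a Quick Start snippet could cover. It assumes the standard Ascend stack; the CANN install path, the Gitee MindSpeed URL, and all versions are assumptions to be verified against the MindSpeed compatibility matrix, not a tested recipe.

# Assumed Ascend NPU setup for Megatron-SWIFT; paths and versions are illustrative only.
# 1. The CANN toolkit is assumed to be installed under the default prefix; load its environment.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 2. Install the Ascend PyTorch plugin.
pip install torch-npu
# 3. Install MindSpeed, which provides mindspeed.megatron_adaptor and repatch.
git clone https://gitee.com/ascend/MindSpeed.git
pip install -e MindSpeed
# 4. Provide a Megatron-LM checkout; MEGATRON_LM_PATH is read in swift/megatron/init.py.
git clone https://github.com/NVIDIA/Megatron-LM.git
export MEGATRON_LM_PATH=$(pwd)/Megatron-LM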