12 changes: 12 additions & 0 deletions swift/megatron/init.py
@@ -9,6 +9,12 @@
from typing import List, Optional, Tuple

import peft
try:
    # Enable Megatron on Ascend NPU
    import mindspeed.megatron_adaptor  # noqa: F401
    HAS_MINDSPEED = True
except ImportError:
    HAS_MINDSPEED = False
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -710,6 +716,12 @@ def _patch_megatron():


def init_megatron_env() -> None:
    if HAS_MINDSPEED:
        from mindspeed.megatron_adaptor import repatch

        # Workaround for transformer_engine not being found on Ascend NPU
        repatch({})

    if 'MEGATRON_LM_PATH' not in os.environ:
        # TODO: Synchronization issues may occur in DDP scenarios
        # if the distributed environment has not been initialized.
14 changes: 14 additions & 0 deletions swift/megatron/train/sft.py
@@ -1,10 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from dataclasses import asdict
from functools import partial
from typing import List, Optional, Union

import torch
import torch.distributed as dist
try:
    # Enable Megatron on Ascend NPU
    from mindspeed.megatron_adaptor import repatch
except ImportError:
    repatch = None

from swift.llm import TEMPLATE_MAPPING
from swift.llm.train import SwiftSft
@@ -28,6 +34,14 @@ def __init__(self, args: Optional[Union[List[str], MegatronTrainArguments]] = None
        self.train_msg = {}
        super(SwiftSft, self).__init__(args)
        args = self.args
        if repatch is not None:
            megatron_args = asdict(args)
            if megatron_args["attention_backend"] != "local":
                megatron_args["use_flash_attn"] = True
                # MindSpeed requires passing `use_flash_attn` to Megatron
                # to enable flash attention on Ascend NPU.
                self.args.use_flash_attn = True
            repatch(megatron_args)
Comment on lines +38 to +44 (Contributor, severity: medium):
The logic for setting use_flash_attn can be simplified for better clarity and to avoid redundancy. It's clearer to modify args.use_flash_attn directly and then create the megatron_args dictionary from the updated args object. This avoids modifying both the dictionary and the args object separately for the same value.

Suggested change
-            megatron_args = asdict(args)
-            if megatron_args["attention_backend"] != "local":
-                megatron_args["use_flash_attn"] = True
-                # MindSpeed requires passing `use_flash_attn` to Megatron
-                # to enable flash attention on Ascend NPU.
-                self.args.use_flash_attn = True
-            repatch(megatron_args)
+            if args.attention_backend != "local":
+                # MindSpeed requires passing `use_flash_attn` to Megatron
+                # to enable flash attention on Ascend NPU.
+                args.use_flash_attn = True
+            megatron_args = asdict(args)
+            repatch(megatron_args)

        template_cls = TEMPLATE_MAPPING[args.template].template_cls
        if args.model_meta.is_multimodal and template_cls and template_cls.use_model:
            kwargs = {'return_dummy_model': True}
45 changes: 45 additions & 0 deletions vl-moe.sh
@@ -0,0 +1,45 @@
PYTORCH_NPU_ALLOC_CONF='expandable_segments:True' \
Collaborator comment:
It is recommended to place the examples in the examples/npu/megatron directory.

In the future, the npu directory can provide more NPU-related scripts.

It is recommended to add a section on NPU environment installation in the Megatron-SWIFT Quick Start documentation; a rough sketch of such a section follows at the end of this file's review.

NPROC_PER_NODE=4 \
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--load_safetensors true \
--save_safetensors true \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
--load_from_cache_file true \
--train_type full \
--freeze_llm false \
--freeze_vit true \
--freeze_aligner true \
--split_dataset_ratio 0.01 \
--tensor_model_parallel_size 2 \
--sequence_parallel true \
--expert_model_parallel_size 2 \
--pipeline_model_parallel_size 2 \
--decoder_first_pipeline_num_layers 5 \
--moe_aux_loss_coeff 1e-3 \
--micro_batch_size 1 \
--global_batch_size 4 \
--finetune true \
--lr 1e-4 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-5 \
--max_epochs 1 \
--save megatron_output/Qwen3-VL-30B-A3B \
--eval_interval 200 \
--save_interval 200 \
--vit_gradient_checkpointing true \
--max_length 2048 \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--packing true \
--cross_entropy_loss_fusion true \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--attention_backend flash
# --moe_permute_fusion true \
Contributor comment (severity: medium):
This commented-out line includes an unnecessary trailing backslash. Additionally, the file is missing a newline character at the end. According to POSIX standards, a text file should conclude with a newline to ensure compatibility with various command-line tools.

Suggested change
-# --moe_permute_fusion true \
+# --moe_permute_fusion true

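Responding to the collaborator suggestion above about documenting NPU environment installation, here is a rough sketch of what such a Quick Start snippet could cover. It assumes the standard Ascend stack; the CANN install path, the Gitee MindSpeed URL, and all versions are assumptions to be verified against the MindSpeed compatibility matrix, not a tested recipe.

# Assumed Ascend NPU setup for Megatron-SWIFT; paths and versions are illustrative only.
# 1. The CANN toolkit is assumed to be installed under the default prefix; load its environment.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 2. Install the Ascend PyTorch plugin.
pip install torch-npu
# 3. Install MindSpeed, which provides mindspeed.megatron_adaptor and repatch.
git clone https://gitee.com/ascend/MindSpeed.git
pip install -e MindSpeed
# 4. Provide a Megatron-LM checkout; MEGATRON_LM_PATH is read in swift/megatron/init.py.
git clone https://github.com/NVIDIA/Megatron-LM.git
export MEGATRON_LM_PATH=$(pwd)/Megatron-LM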