On a single machine with a single GPU, every weight update phase is extremely slow: about 10 minutes for Qwen2.5-VL-3B and about 20 minutes for Qwen2.5-VL-7B, so the GPU sits mostly idle during training.
If this is not expected behavior, is there any way to speed it up? Thanks a lot!
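To narrow it down I plan to time the ZeRO-3 parameter gather by itself, since materializing the full un-sharded weights is typically the first half of a train-to-infer sync. A minimal sketch, assuming the actor sits in a `deepspeed.DeepSpeedEngine` (the `engine` handle below is a placeholder, not a ROLL object):

```python
import time

import deepspeed
import torch


def time_weight_gather(engine) -> float:
    """Time how long ZeRO-3 takes to materialize the full parameters.

    `engine` is assumed to be a deepspeed.DeepSpeedEngine wrapping the
    actor_train model; this is a diagnostic sketch, not ROLL's sync path.
    """
    params = list(engine.module.parameters())
    torch.cuda.synchronize()
    start = time.perf_counter()
    # Read-only gather: the rank temporarily holds the full weights.
    # Gathering everything at once is memory-heavy next to a colocated
    # vLLM instance; gather per-tensor instead if this OOMs.
    with deepspeed.zero.GatheredParameters(params):
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
    print(f"gathered {len(params)} parameter tensors in {elapsed:.1f}s")
    return elapsed
```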
The config was adapted from examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml:
```yaml
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen2_5_vl_7B_custom_config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /fs/fast/ROLL/playground/models/${exp_name}

track_with: wandb
tracker_kwargs:
  log_dir: /fs/fast/ROLL/playground/wandb/roll_exp/rlvr_custom

save_steps: 10
logging_steps: 1
resume_from_checkpoint: false

rollout_batch_size: 32
num_return_sequences_in_group: 8
is_num_return_sequences_expand: true
prompt_length: 1536
response_length: 1024
generate_opt_level: 0

ppo_epochs: 1
value_clip: 0.5
reward_clip: 10
advantage_clip: 10.0
whiten_advantages: false
init_kl_coef: 0.0
adv_estimator: "grpo"
use_kl_loss: true
kl_loss_coef: 1.0e-2

pretrain: /fs/fast/pretrained_models/Qwen2.5-VL-7B-Instruct
# pretrain: /fs/fast/pretrained_models/Qwen2.5-VL-3B-Instruct

actor_train:
  model_args:
    freeze_module_prefix: visual.blocks,visual.patch_embed
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-5
    weight_decay: 1.0e-2
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
    num_train_epochs: 2
  data_args:
    template: qwen2-vl
    file_name: /fs/fast/DATASETS/xxx
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: deepspeed_train
    strategy_config: ${deepspeed_zero3}
  device_mapping: list(range(3,4))
  infer_batch_size: 4

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen2-vl
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      max_model_len: 2560
      enable_prefix_caching: false
    num_gpus_per_worker: 1
  device_mapping: list(range(3,4))
  infer_batch_size: 4

reference:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    # In transformers>=4.50.0, if model.from_pretrained uses an auto device_map,
    # tp_plan is None (while the model's tp_plan is not None), and WORLD_SIZE>1,
    # TP would be used. Thus use device_map="cuda:0" to disable HF transformers
    # parallelism; otherwise use zero3 for the reference model.
    device_map: "cuda:0"
    model_type: ~
  data_args:
    template: qwen2-vl
  strategy_args:
    strategy_name: hf_infer
    strategy_config: ~
  device_mapping: list(range(3,4))
  infer_batch_size: 4

rewards:
  llm_judge:
    worker_cls: roll.pipeline.rlvr.rewards.custom_reward_worker.CustomRewardWorker
    judge_model_type: inference
    model_args:
      model_name_or_path: /fs/fast/pretrained_models/Qwen3-4B-Instruct-2507
      attn_implementation: fa2
      dtype: bf16
      model_type: ~
    data_args:
      template: qwen3
    strategy_args:
      strategy_name: hf_infer
      strategy_config: ~
    device_mapping: list(range(3,4))
    infer_batch_size: 4
```
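My (possibly wrong) mental model of the colocated weight sync is the per-tensor streaming pattern below, which several RLHF stacks use. Whether ROLL does the same is an assumption on my part; `vllm_model` stands for the torch module inside the vLLM worker, which I'm assuming exposes vLLM's usual `load_weights` interface (reaching it is version-dependent):

```python
import deepspeed


def push_weights_to_vllm(train_module, vllm_model):
    """Sketch of a streaming train->infer sync under ZeRO-3.

    Gathering one tensor at a time bounds peak GPU memory, at the cost
    of many small gathers; `vllm_model` is assumed to implement vLLM's
    standard load_weights(iterable of (name, tensor)) interface.
    """
    for name, param in train_module.named_parameters():
        # Materialize only this parameter's full tensor...
        with deepspeed.zero.GatheredParameters([param]):
            # ...and hand it to vLLM's weight loader as a (name, tensor) pair.
            vllm_model.load_weights([(name, param.data)])
```

Note that in my config, actor_train, actor_infer, reference, and the reward judge all share GPU 3 (`device_mapping: list(range(3,4))`), so any sync cost is paid on the same device that vLLM's `gpu_memory_utilization: 0.8` already claims.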