Weight update progress is too slow #279

@luyouqi233

Description

Single machine, single GPU: every weight update progress is very slow (about 10 min for Qwen2.5-VL-3B and about 20 min for Qwen2.5-VL-7B), so the GPU is basically idle for most of training.

If this is not expected behavior, is there any way to speed it up? Many thanks.
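
To make "basically idle" concrete, here is a minimal monitoring sketch (not part of ROLL; it assumes pynvml is installed and that GPU index 3 is the one used by device_mapping in the config below):

```python
# Minimal monitoring sketch (not part of ROLL): sample GPU utilization once per
# second during a weight update to quantify the idle time. Assumes pynvml is
# installed and GPU index 3 matches device_mapping in the config below.
import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(3)

samples = []
for _ in range(600):  # roughly one 10-minute weight update for the 3B model
    samples.append(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)
    time.sleep(1)

pynvml.nvmlShutdown()
idle_seconds = sum(1 for u in samples if u == 0)
print(f"GPU idle for {idle_seconds}/{len(samples)} sampled seconds")
```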

The config was adapted from examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml:

defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen2_5_vl_7B_custom_config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /fs/fast/ROLL/playground/models/${exp_name}

track_with: wandb
tracker_kwargs:
  log_dir: /fs/fast/ROLL/playground/wandb/roll_exp/rlvr_custom

save_steps: 10
logging_steps: 1
resume_from_checkpoint: false

rollout_batch_size: 32
num_return_sequences_in_group: 8
is_num_return_sequences_expand: true
prompt_length: 1536
response_length: 1024
generate_opt_level: 0

ppo_epochs: 1
value_clip: 0.5
reward_clip: 10
advantage_clip: 10.0
whiten_advantages: false
init_kl_coef: 0.0
adv_estimator: "grpo"
use_kl_loss: true
kl_loss_coef: 1.0e-2

pretrain: /fs/fast/pretrained_models/Qwen2.5-VL-7B-Instruct
# pretrain: /fs/fast/pretrained_models/Qwen2.5-VL-3B-Instruct

actor_train:
  model_args:
    freeze_module_prefix: visual.blocks,visual.patch_embed
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-5
    weight_decay: 1.0e-2
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
    num_train_epochs: 2
  data_args:
    template: qwen2-vl
    file_name: /fs/fast/DATASETS/xxx
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: deepspeed_train
    strategy_config: ${deepspeed_zero3}
  device_mapping: list(range(3,4))
  infer_batch_size: 4

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen2-vl
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      max_model_len: 2560
      enable_prefix_caching: false
  num_gpus_per_worker: 1
  device_mapping: list(range(3,4))
  infer_batch_size: 4

reference:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    # In transformers>=4.50.0, if model.from_pretrained is called with an auto
    # device_map, no explicit tp_plan is passed while the model itself has a
    # non-None tp_plan, and WORLD_SIZE>1, tensor parallelism would be enabled.
    # So set device_map to a single device (e.g. "cuda:0") to disable HF
    # transformers parallelism; otherwise use zero3 for the reference model.
    device_map: "cuda:0"
    model_type: ~
  data_args:
    template: qwen2-vl
  strategy_args:
    strategy_name: hf_infer
    strategy_config: ~
  device_mapping: list(range(3,4))
  infer_batch_size: 4

rewards:
  llm_judge:
    worker_cls: roll.pipeline.rlvr.rewards.custom_reward_worker.CustomRewardWorker
    judge_model_type: inference
    model_args:
      model_name_or_path: /fs/fast/pretrained_models/Qwen3-4B-Instruct-2507
      attn_implementation: fa2
      dtype: bf16
      model_type: ~
    data_args:
      template: qwen3
    strategy_args:
      strategy_name: hf_infer
      strategy_config: ~
    device_mapping: list(range(3,4))
    infer_batch_size: 4
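
For reference, the device_map comment under reference.model_args above corresponds roughly to loading the reference model like this (a minimal sketch with plain transformers, not ROLL's actual loading code; the path is the pretrain path from the config):

```python
# Minimal sketch, not ROLL's actual loading code: pinning device_map to a single
# device keeps HF transformers from enabling its own tensor parallelism when
# WORLD_SIZE > 1, as described in the comment under reference.model_args above.
import torch
from transformers import Qwen2_5_VLForConditionalGeneration

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "/fs/fast/pretrained_models/Qwen2.5-VL-7B-Instruct",  # pretrain path from the config
    torch_dtype=torch.bfloat16,                           # dtype: bf16
    attn_implementation="flash_attention_2",              # attn_implementation: fa2
    device_map="cuda:0",                                   # device_map: "cuda:0"
)
```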
