Weight update progress is too slow #279

@luyouqi233

Description

Single machine, single GPU: every weight update progress is very slow (about 10 min for Qwen2.5-VL-3B and about 20 min for Qwen2.5-VL-7B), so the GPU is basically idle for most of training.

If this is not expected behavior, is there any way to speed it up? Many thanks.
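
To make "basically idle" concrete, here is a minimal monitoring sketch (not part of ROLL; it assumes pynvml is installed and that GPU index 3 is the one used by device_mapping in the config below):

```python
# Minimal monitoring sketch (not part of ROLL): sample GPU utilization once per
# second during a weight update to quantify the idle time. Assumes pynvml is
# installed and GPU index 3 matches device_mapping in the config below.
import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(3)

samples = []
for _ in range(600):  # roughly one 10-minute weight update for the 3B model
    samples.append(pynvml.nvmlDeviceGetUtilizationRates(handle).gpu)
    time.sleep(1)

pynvml.nvmlShutdown()
idle_seconds = sum(1 for u in samples if u == 0)
print(f"GPU idle for {idle_seconds}/{len(samples)} sampled seconds")
```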

The config was adapted from examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml:

defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen2_5_vl_7B_custom_config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output

checkpoint_config:
  type: file_system
  output_dir: /fs/fast/ROLL/playground/models/${exp_name}

track_with: wandb
tracker_kwargs:
  log_dir: /fs/fast/ROLL/playground/wandb/roll_exp/rlvr_custom

save_steps: 10
logging_steps: 1
resume_from_checkpoint: false

rollout_batch_size: 32
num_return_sequences_in_group: 8
is_num_return_sequences_expand: true
prompt_length: 1536
response_length: 1024
generate_opt_level: 0

ppo_epochs: 1
value_clip: 0.5
reward_clip: 10
advantage_clip: 10.0
whiten_advantages: false
init_kl_coef: 0.0
adv_estimator: "grpo"
use_kl_loss: true
kl_loss_coef: 1.0e-2

pretrain: /fs/fast/pretrained_models/Qwen2.5-VL-7B-Instruct
# pretrain: /fs/fast/pretrained_models/Qwen2.5-VL-3B-Instruct

actor_train:
  model_args:
    freeze_module_prefix: visual.blocks,visual.patch_embed
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-5
    weight_decay: 1.0e-2
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
    num_train_epochs: 2
  data_args:
    template: qwen2-vl
    file_name: /fs/fast/DATASETS/xxx
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: deepspeed_train
    strategy_config: ${deepspeed_zero3}
  device_mapping: list(range(3,4))
  infer_batch_size: 4

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen2-vl
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      max_model_len: 2560
      enable_prefix_caching: false
  num_gpus_per_worker: 1
  device_mapping: list(range(3,4))
  infer_batch_size: 4

reference:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    # In transformers>=4.50.0, if model.from_pretrained is called with an auto
    # device_map, no explicit tp_plan is passed while the model itself has a
    # non-None tp_plan, and WORLD_SIZE>1, tensor parallelism would be enabled.
    # So set device_map to a single device (e.g. "cuda:0") to disable HF
    # transformers parallelism; otherwise use zero3 for the reference model.
    device_map: "cuda:0"
    model_type: ~
  data_args:
    template: qwen2-vl
  strategy_args:
    strategy_name: hf_infer
    strategy_config: ~
  device_mapping: list(range(3,4))
  infer_batch_size: 4

rewards:
  llm_judge:
    worker_cls: roll.pipeline.rlvr.rewards.custom_reward_worker.CustomRewardWorker
    judge_model_type: inference
    model_args:
      model_name_or_path: /fs/fast/pretrained_models/Qwen3-4B-Instruct-2507
      attn_implementation: fa2
      dtype: bf16
      model_type: ~
    data_args:
      template: qwen3
    strategy_args:
      strategy_name: hf_infer
      strategy_config: ~
    device_mapping: list(range(3,4))
    infer_batch_size: 4
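
For reference, the device_map comment under reference.model_args above corresponds roughly to loading the reference model like this (a minimal sketch with plain transformers, not ROLL's actual loading code; the path is the pretrain path from the config):

```python
# Minimal sketch, not ROLL's actual loading code: pinning device_map to a single
# device keeps HF transformers from enabling its own tensor parallelism when
# WORLD_SIZE > 1, as described in the comment under reference.model_args above.
import torch
from transformers import Qwen2_5_VLForConditionalGeneration

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "/fs/fast/pretrained_models/Qwen2.5-VL-7B-Instruct",  # pretrain path from the config
    torch_dtype=torch.bfloat16,                           # dtype: bf16
    attn_implementation="flash_attention_2",              # attn_implementation: fa2
    device_map="cuda:0",                                   # device_map: "cuda:0"
)
```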
