3 | 3 | import copy |
4 | 4 | import bisect |
5 | 5 | from typing import Optional |
| 6 | +from tqdm import tqdm |
6 | 7 | from lightllm.utils.log_utils import init_logger |
7 | 8 | from lightllm.utils.envs_utils import get_env_start_args |
8 | 9 | from lightllm.distributed import dist_group_manager, lightllm_capture_graph, CustomProcessGroup |
@@ -191,7 +192,12 @@ def warmup(self, model): |
191 | 192 | model: TpPartBaseModel = model |
192 | 193 |
193 | 194 | # decode cuda graph init |
194 | | - for batch_size in self.cuda_graph_batch_sizes[::-1]: |
| 195 | + progress_bar = tqdm(self.cuda_graph_batch_sizes[::-1], desc="Capturing CUDA graphs") |
| 196 | + for batch_size in progress_bar: |
| 197 | + # Query free GPU memory (bytes) so the bar shows headroom before each capture
| 198 | + avail_mem, _ = torch.cuda.mem_get_info()
| 199 | + avail_mem_gb = avail_mem / (1024**3)
| 200 | + progress_bar.set_description(f"Capturing CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB")
195 | 201 | seq_len = 2 |
196 | 202 | total_token_num = batch_size * seq_len |
197 | 203 | max_len_in_batch = self.graph_max_len_in_batch |
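
The hunk above pairs tqdm's mutable description with torch.cuda.mem_get_info(), which returns (free, total) device memory in bytes. A minimal standalone sketch of the same pattern, assuming a CUDA device is available; capture_one_graph and the batch-size list are placeholders here, not lightllm's real API:

import torch
from tqdm import tqdm

def capture_one_graph(batch_size):
    # Placeholder for the real capture step (in the actual warmup this is a
    # dummy decode run under CUDA graph capture); does nothing here.
    pass

def warmup_with_progress(batch_sizes):
    # Largest batch first, mirroring the [::-1] iteration in warmup().
    bar = tqdm(sorted(batch_sizes, reverse=True), desc="Capturing CUDA graphs")
    for batch_size in bar:
        free_bytes, _ = torch.cuda.mem_get_info()  # (free, total) in bytes
        bar.set_description(
            f"Capturing CUDA graphs - Batch: {batch_size}, "
            f"AvailMem: {free_bytes / (1024 ** 3):.2f}GB"
        )
        capture_one_graph(batch_size)

Updating the description per iteration (rather than once up front) is what lets the bar double as a live memory readout while the captures run.
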
@@ -246,7 +252,12 @@ def warmup_overlap(self, model): |
246 | 252 |
247 | 253 | model: TpPartBaseModel = model |
248 | 254 |
249 | | - for batch_size in self.cuda_graph_batch_sizes[::-1]: |
| 255 | + progress_bar = tqdm(self.cuda_graph_batch_sizes[::-1], desc="Capturing overlap CUDA graphs") |
| 256 | + for batch_size in progress_bar: |
| 257 | + # Query free GPU memory (bytes) so the bar shows headroom before each capture
| 258 | + avail_mem, _ = torch.cuda.mem_get_info()
| 259 | + avail_mem_gb = avail_mem / (1024**3)
| 260 | + progress_bar.set_description(f"Capturing overlap CUDA graphs - Batch: {batch_size}, AvailMem: {avail_mem_gb:.2f}GB")
250 | 261 | decode_batches = [] |
251 | 262 | for micro_batch_index in [0, 1]: |
252 | 263 | # dummy decoding, capture the cudagraph |
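
warmup_overlap() captures two micro-batch graphs per batch size, so free memory drops faster here. If the per-capture cost is of interest, the same readout can be differenced across iterations; a hypothetical extension (this delta bookkeeping is not part of the diff, and capture_fn stands in for the real capture step):

import torch

def report_capture_deltas(batch_sizes, capture_fn):
    # capture_fn is a placeholder callable that captures one graph.
    prev_free, _ = torch.cuda.mem_get_info()
    for batch_size in batch_sizes:
        capture_fn(batch_size)
        free, _ = torch.cuda.mem_get_info()
        used_mb = (prev_free - free) / (1024 ** 2)
        print(f"batch {batch_size}: capture consumed ~{used_mb:.1f} MB")
        prev_free = free
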