
Commit 063e48c

[megatron] fix: make bridge exported cloned weights store on CPU
There are many `clone` calls in the GPTBridge code, and if every copied weight is kept on the GPU, OOM can easily happen. This PR addresses the issue by cloning to CPU instead, so that exported weights are stored in CPU memory. It also adds `torch.cuda.empty_cache()` calls to reduce the likelihood of OOM during LoRA merge.

Signed-off-by: Hollow Man <[email protected]>
1 parent 6cfaeaa commit 063e48c
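
To make the motivation concrete, here is a minimal sketch of the memory behaviour the commit message describes (the `export_state_dict` helper and its arguments are hypothetical, not code from this commit): cloning every exported weight on the GPU keeps a second copy of each layer resident on the device, while detaching and moving the copy to CPU leaves only the original parameters there.

```python
import torch


def export_state_dict(named_weights, to_cpu: bool = True):
    """Collect detached copies of weights; `to_cpu=True` mirrors what this commit does."""
    exported = {}
    for name, weight in named_weights:
        if to_cpu:
            # The copy lands in host memory, so the GPU only ever holds the originals.
            exported[name] = weight.detach().to('cpu')
        else:
            # The copy stays on the GPU; with many layers this is where OOM starts.
            exported[name] = weight.detach().clone()
    if torch.cuda.is_available():
        # Return cached blocks freed by intermediate tensors to the allocator.
        torch.cuda.empty_cache()
    return exported
```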

File tree: 2 files changed, +80 -40 lines


swift/megatron/model/gpt_bridge.py

Lines changed: 73 additions & 40 deletions
```diff
@@ -157,6 +157,7 @@ def _set_weight(
                     group=tp_group,
                 )
                 del splited_weights
+                torch.cuda.empty_cache()
             else:
                 tensor = hf_weight
             if offset:
@@ -243,6 +244,7 @@ def _get_weight(self, mg_weight: torch.Tensor, mg_key: Optional[str], offset: fl
                 )
                 tensor = torch.cat(output, dim=tp_dim)
                 del output
+                torch.cuda.empty_cache()
         # pp/ep
         if pp_size > 1:
             src_rank = torch.tensor([0 if tensor is None else pp_rank], dtype=torch.int64, device='cuda')
@@ -273,6 +275,22 @@ def _get_weight(self, mg_weight: torch.Tensor, mg_key: Optional[str], offset: fl
             tensor = None
         return tensor
 
+    def _cpu_clone(self, tensor: Optional[torch.Tensor]):
+        if tensor is None:
+            return None
+        if not isinstance(tensor, torch.Tensor):
+            return tensor
+        # Detach to avoid any autograd references
+        t = tensor.detach()
+        if t.device.type != 'cpu':
+            # Move to CPU if not already (this will make a copy for sure)
+            # `non_blocking=True` attempts an asynchronous copy for GPU->CPU when destination is
+            # pinned memory; this is best-effort and will fall back to blocking if not possible.
+            # https://docs.pytorch.org/tutorials/intermediate/pinmem_nonblock.html
+            return t.to('cpu', non_blocking=True)
+        else:
+            return t.clone()
+
     def _set_state_dict(self,
                         mg_module,
                         mg_key: str,
```
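
An aside on the `non_blocking=True` comment in `_cpu_clone` above (an illustrative sketch, not part of the diff): a device-to-host copy only runs asynchronously when the CPU destination is pinned; `.to('cpu', non_blocking=True)` allocates ordinary pageable memory, so PyTorch falls back to a blocking copy, which is the safe behaviour the comment relies on.

```python
import torch

if torch.cuda.is_available():
    src = torch.randn(4096, 4096, device='cuda')

    # Pageable destination: non_blocking=True is effectively a no-op here, the copy
    # blocks, and the result is immediately safe to read.
    pageable = src.to('cpu', non_blocking=True)

    # Pinned destination: the copy can genuinely overlap with GPU work, so synchronize
    # before touching the CPU tensor.
    pinned = torch.empty(src.shape, dtype=src.dtype, pin_memory=True)
    pinned.copy_(src, non_blocking=True)
    torch.cuda.synchronize()
```

The remaining hunks of `gpt_bridge.py` follow.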
```diff
@@ -412,25 +430,28 @@ def _set_attn_state(self, mg_attn, hf_state_dict, hf_prefix: str, layer_idx: int
         if lora_A is not None:
             self._peft_target_modules.update({'q_proj', 'k_proj', 'v_proj'})
             for key in ['q_proj', 'k_proj', 'v_proj']:
-                hf_state_dict[f'{key}.lora_A.weight'] = lora_A.clone()
+                hf_state_dict[f'{key}.lora_A.weight'] = self._cpu_clone(lora_A)
             lora_B = lora_B.reshape((num_query_groups, -1, lora_B.shape[-1]))
-            hf_state_dict['q_proj.lora_B.weight'] = lora_B[:, :q_dim, :].reshape(-1, lora_B.shape[-1]).clone()
-            hf_state_dict['k_proj.lora_B.weight'] = lora_B[:,
-                                                           q_dim:-kv_dim, :].reshape(-1,
-                                                                                     lora_B.shape[-1]).clone()
-            hf_state_dict['v_proj.lora_B.weight'] = lora_B[:, -kv_dim:, :].reshape(-1, lora_B.shape[-1]).clone()
+            hf_state_dict['q_proj.lora_B.weight'] = self._cpu_clone(lora_B[:, :q_dim, :].reshape(
+                -1, lora_B.shape[-1]))
+            hf_state_dict['k_proj.lora_B.weight'] = self._cpu_clone(lora_B[:, q_dim:-kv_dim, :].reshape(
+                -1, lora_B.shape[-1]))
+            hf_state_dict['v_proj.lora_B.weight'] = self._cpu_clone(lora_B[:, -kv_dim:, :].reshape(
+                -1, lora_B.shape[-1]))
+            torch.cuda.empty_cache()
         elif not self._is_peft_format:
             mg_attn_weight = self._get_weight(None if mg_attn is None else mg_attn.linear_qkv.weight.data,
                                               'linear_qkv.weight')
             if mg_attn_weight is not None:
                 mg_attn_weight = mg_attn_weight.reshape((num_query_groups, -1, args.hidden_size))
-                hf_state_dict['q_proj.weight'] = mg_attn_weight[:, :q_dim, :].reshape(-1, args.hidden_size).clone()
-                hf_state_dict['k_proj.weight'] = mg_attn_weight[:,
-                                                                q_dim:-kv_dim, :].reshape(-1,
-                                                                                          args.hidden_size).clone()
-                hf_state_dict['v_proj.weight'] = mg_attn_weight[:, -kv_dim:, :].reshape(-1,
-                                                                                        args.hidden_size).clone()
+                hf_state_dict['q_proj.weight'] = self._cpu_clone(mg_attn_weight[:, :q_dim, :].reshape(
+                    -1, args.hidden_size))
+                hf_state_dict['k_proj.weight'] = self._cpu_clone(mg_attn_weight[:, q_dim:-kv_dim, :].reshape(
+                    -1, args.hidden_size))
+                hf_state_dict['v_proj.weight'] = self._cpu_clone(mg_attn_weight[:, -kv_dim:, :].reshape(
+                    -1, args.hidden_size))
                 del mg_attn_weight
+                torch.cuda.empty_cache()
             self._set_state_dict(mg_attn, 'linear_proj.weight', hf_state_dict, 'o_proj.weight', to_mcore)
 
         # Copy bias
@@ -448,9 +469,9 @@ def _set_attn_state(self, mg_attn, hf_state_dict, hf_prefix: str, layer_idx: int
                                               'linear_qkv.bias')
             if mg_attn_bias is not None:
                 mg_attn_bias = mg_attn_bias.reshape((num_query_groups, -1))
-                hf_state_dict['q_proj.bias'] = mg_attn_bias[:, :q_dim].reshape(-1).clone()
-                hf_state_dict['k_proj.bias'] = mg_attn_bias[:, q_dim:-kv_dim].reshape(-1).clone()
-                hf_state_dict['v_proj.bias'] = mg_attn_bias[:, -kv_dim:].reshape(-1).clone()
+                hf_state_dict['q_proj.bias'] = self._cpu_clone(mg_attn_bias[:, :q_dim].reshape(-1))
+                hf_state_dict['k_proj.bias'] = self._cpu_clone(mg_attn_bias[:, q_dim:-kv_dim].reshape(-1))
+                hf_state_dict['v_proj.bias'] = self._cpu_clone(mg_attn_bias[:, -kv_dim:].reshape(-1))
         if args.qk_layernorm:
             hf_q_norm_key = 'q_norm.weight' if hasattr(hf_attn, 'q_norm') else 'query_layernorm.weight'
             hf_k_norm_key = 'k_norm.weight' if hasattr(hf_attn, 'k_norm') else 'key_layernorm.weight'
@@ -626,6 +647,7 @@ def _set_mlp_state(self,
                     weight_list.append(torch.stack([gate_proj_weight, up_proj_weight], dim=0))
                 gate_up_proj_weight = torch.concat(weight_list, dim=0)
                 del weight_list
+                torch.cuda.empty_cache()
             else:
                 gate_proj_weight = hf_state_dict['gate_proj.weight'].load()
                 up_proj_weight = hf_state_dict['up_proj.weight'].load()
@@ -637,6 +659,7 @@ def _set_mlp_state(self,
                     getattr(mg_mlp.linear_fc1,
                             f'weight{i}').data.copy_(fc1_weight[i].view(-1, fc1_weight.shape[-1]))
                 del fc1_weight
+                torch.cuda.empty_cache()
             else:
                 mg_mlp.linear_fc1.weight.data.copy_(fc1_weight.view(-1, fc1_weight.shape[-1]))
         else:
@@ -678,28 +701,33 @@ def _set_mlp_state(self,
                 lora_B = lora_B.view(num_local_experts, -1, lora_B.shape[-1])
                 for i in range(num_local_experts):
                     hf_i = i + ep_rank * num_local_experts
-                    hf_state_dict[f'{hf_i}.gate_up_proj.lora_A.weight'] = lora_A[i].clone()
-                    hf_state_dict[f'{hf_i}.gate_up_proj.lora_B.weight'] = lora_B[i].clone()
+                    hf_state_dict[f'{hf_i}.gate_up_proj.lora_A.weight'] = self._cpu_clone(lora_A[i])
+                    hf_state_dict[f'{hf_i}.gate_up_proj.lora_B.weight'] = self._cpu_clone(lora_B[i])
+                torch.cuda.empty_cache()
 
             else:
-                hf_state_dict['gate_up_proj.lora_A.weight'] = lora_A.clone()
-                hf_state_dict['gate_up_proj.lora_B.weight'] = lora_B.view(-1, lora_B.shape[-1]).clone()
+                hf_state_dict['gate_up_proj.lora_A.weight'] = self._cpu_clone(lora_A)
+                hf_state_dict['gate_up_proj.lora_B.weight'] = self._cpu_clone(
+                    lora_B.view(-1, lora_B.shape[-1]))
+                torch.cuda.empty_cache()
         else:
             self._peft_target_modules.update({'gate_proj', 'up_proj'})
             if is_expert:
                 lora_A = lora_A.view(num_local_experts, -1, lora_A.shape[-1])
                 lora_B = lora_B.view(num_local_experts, 2, -1, lora_B.shape[-1])
                 for i in range(num_local_experts):
                     hf_i = i + ep_rank * num_local_experts
-                    hf_state_dict[f'{hf_i}.gate_proj.lora_A.weight'] = lora_A[i].clone()
-                    hf_state_dict[f'{hf_i}.up_proj.lora_A.weight'] = lora_A[i].clone()
-                    hf_state_dict[f'{hf_i}.gate_proj.lora_B.weight'] = lora_B[i][0].clone()
-                    hf_state_dict[f'{hf_i}.up_proj.lora_B.weight'] = lora_B[i][1].clone()
+                    hf_state_dict[f'{hf_i}.gate_proj.lora_A.weight'] = self._cpu_clone(lora_A[i])
+                    hf_state_dict[f'{hf_i}.up_proj.lora_A.weight'] = self._cpu_clone(lora_A[i])
+                    hf_state_dict[f'{hf_i}.gate_proj.lora_B.weight'] = self._cpu_clone(lora_B[i][0])
+                    hf_state_dict[f'{hf_i}.up_proj.lora_B.weight'] = self._cpu_clone(lora_B[i][1])
+                torch.cuda.empty_cache()
             else:
-                hf_state_dict['gate_proj.lora_A.weight'] = lora_A.clone()
-                hf_state_dict['up_proj.lora_A.weight'] = lora_A.clone()
-                hf_state_dict['gate_proj.lora_B.weight'] = lora_B[0].clone()
-                hf_state_dict['up_proj.lora_B.weight'] = lora_B[1].clone()
+                hf_state_dict['gate_proj.lora_A.weight'] = self._cpu_clone(lora_A)
+                hf_state_dict['up_proj.lora_A.weight'] = self._cpu_clone(lora_A)
+                hf_state_dict['gate_proj.lora_B.weight'] = self._cpu_clone(lora_B[0])
+                hf_state_dict['up_proj.lora_B.weight'] = self._cpu_clone(lora_B[1])
+                torch.cuda.empty_cache()
     elif not self._is_peft_format:
         if mg_mlp is None:
             fc1_weight = None
@@ -725,27 +753,29 @@ def _set_mlp_state(self,
                 if 'gate_up_proj' in hf_state_dict:
                     gate_up_proj_weight = torch.concat(
                         [hf_state_dict['gate_up_proj'], gate_up_proj_weight], dim=0)
-                hf_state_dict['gate_up_proj'] = gate_up_proj_weight.clone()
+                hf_state_dict['gate_up_proj'] = self._cpu_clone(gate_up_proj_weight)
             else:
                 for i in range(num_local_experts):
                     hf_i = i + ep_rank * num_local_experts
-                    hf_state_dict[f'{hf_i}.gate_up_proj.weight'] = gate_up_proj_weight[i].clone()
-                del gate_up_proj_weight
+                    hf_state_dict[f'{hf_i}.gate_up_proj.weight'] = self._cpu_clone(
+                        gate_up_proj_weight[i])
+                del gate_up_proj_weight
+                torch.cuda.empty_cache()
         else:
-            hf_state_dict['gate_up_proj.weight'] = gate_up_proj_weight.view(
-                -1, gate_up_proj_weight.shape[-1]).clone()
+            hf_state_dict['gate_up_proj.weight'] = self._cpu_clone(
+                gate_up_proj_weight.view(-1, gate_up_proj_weight.shape[-1]))
    else:
        if is_expert:
            gate_up_proj_weight = gate_up_proj_weight.view(num_local_experts, 2, -1,
                                                           gate_up_proj_weight.shape[-1])
            for i in range(num_local_experts):
                hf_i = i + ep_rank * num_local_experts
-               hf_state_dict[f'{hf_i}.gate_proj.weight'] = gate_up_proj_weight[i][0].clone()
-               hf_state_dict[f'{hf_i}.up_proj.weight'] = gate_up_proj_weight[i][1].clone()
+               hf_state_dict[f'{hf_i}.gate_proj.weight'] = self._cpu_clone(gate_up_proj_weight[i][0])
+               hf_state_dict[f'{hf_i}.up_proj.weight'] = self._cpu_clone(gate_up_proj_weight[i][1])
            del gate_up_proj_weight
        else:
-           hf_state_dict['gate_proj.weight'] = gate_up_proj_weight[0].clone()
-           hf_state_dict['up_proj.weight'] = gate_up_proj_weight[1].clone()
+           hf_state_dict['gate_proj.weight'] = self._cpu_clone(gate_up_proj_weight[0])
+           hf_state_dict['up_proj.weight'] = self._cpu_clone(gate_up_proj_weight[1])
    # linear_fc2
    if is_expert:
        if to_mcore:
@@ -825,8 +855,8 @@ def _set_mlp_state(self,
                lora_B = lora_B.view(num_local_experts, -1, lora_B.shape[-1])
                for i in range(num_local_experts):
                    hf_i = i + ep_rank * num_local_experts
-                   hf_state_dict[f'{hf_i}.down_proj.lora_A.weight'] = lora_A[i].clone()
-                   hf_state_dict[f'{hf_i}.down_proj.lora_B.weight'] = lora_B[i].clone()
+                   hf_state_dict[f'{hf_i}.down_proj.lora_A.weight'] = self._cpu_clone(lora_A[i])
+                   hf_state_dict[f'{hf_i}.down_proj.lora_B.weight'] = self._cpu_clone(lora_B[i])
    elif not self._is_peft_format:
        if mg_mlp is None:
            fc2_weight = None
@@ -838,17 +868,20 @@ def _set_mlp_state(self,
                                            dim=0)
        down_proj_weight = self._get_weight(fc2_weight, 'linear_fc2.weight', is_expert=is_expert)
        del fc2_weight
+       torch.cuda.empty_cache()
        if down_proj_weight is not None:
            down_proj_weight = down_proj_weight.view(num_local_experts, -1, down_proj_weight.shape[-1])
            if hf_grouped:
                down_proj_weight = down_proj_weight.transpose(1, 2)
                if 'down_proj' in hf_state_dict:
                    down_proj_weight = torch.concat([hf_state_dict['down_proj'], down_proj_weight], dim=0)
-               hf_state_dict['down_proj'] = down_proj_weight.clone()
+               hf_state_dict['down_proj'] = self._cpu_clone(down_proj_weight)
            else:
                for i in range(num_local_experts):
                    hf_i = i + ep_rank * num_local_experts
-                   hf_state_dict[f'{hf_i}.down_proj.weight'] = down_proj_weight[i].clone()
+                   hf_state_dict[f'{hf_i}.down_proj.weight'] = self._cpu_clone(down_proj_weight[i])
+               del down_proj_weight
+               torch.cuda.empty_cache()
    else:
        self._set_state_dict(
            mg_mlp, 'linear_fc2.weight', hf_state_dict, 'down_proj.weight', to_mcore, is_expert=is_expert)
```

swift/megatron/tuners/lora.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -370,6 +370,7 @@ def get_delta_weights(self, adapter) -> List[torch.Tensor]:
         assert len(weight_A) == len(weight_B)
         for i in range(len(weight_B)):
             output_tensor.append(transpose(weight_B[i] @ weight_A[i], self.fan_in_fan_out) * self.scaling[adapter])
+        torch.cuda.empty_cache()
 
         return output_tensor
 
@@ -417,10 +418,14 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N
                         weight.data = orig_weights[i]
                 else:
                     base_layer.weight.data = orig_weights[0]
+                del orig_weights
+                torch.cuda.empty_cache()
             else:
                 delta_weights = self.get_delta_weights(active_adapter)
                 for orig_weight, delta_weight in zip(orig_weights, delta_weights):
                     orig_weight.data += delta_weight
+                del delta_weights
+                torch.cuda.empty_cache()
             self.merged_adapters.append(active_adapter)
             if origin_device.type == 'cpu':
                 self.to(device=origin_device)
@@ -452,6 +457,8 @@ def unmerge(self) -> None:
             for orig_weight, delta_weight in zip(orig_weights, delta_weights):
                 # Subtract the delta weight to unmerge
                 orig_weight.data -= delta_weight
+            del delta_weights
+            torch.cuda.empty_cache()
 
         # Clear the merged adapters list
         self.merged_adapters = []
```
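
For context on the `swift/megatron/tuners/lora.py` change, a simplified sketch of one LoRA merge step (a standalone `merge_lora_layer` function with assumed shapes, not ms-swift's actual classes): the delta weight `lora_B @ lora_A`, scaled by the adapter's scaling factor, is added to the base weight in place, and the temporary is deleted and the CUDA cache emptied immediately, mirroring the `del` + `torch.cuda.empty_cache()` calls added in this commit.

```python
import torch


def merge_lora_layer(base_weight: torch.Tensor, lora_A: torch.Tensor,
                     lora_B: torch.Tensor, scaling: float) -> None:
    """Merge one LoRA adapter into its base weight in place (illustrative only)."""
    # lora_B: (out_features, r), lora_A: (r, in_features) -> delta: (out_features, in_features)
    delta_weight = (lora_B @ lora_A) * scaling
    base_weight.data += delta_weight
    # Drop the temporary right away so it cannot pile up across a long merge loop.
    del delta_weight
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
```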
