Skip to content

Commit 367612a

Browse files
committed
fix lora id issue
1 parent 7fa7ddd commit 367612a

File tree

4 files changed

+619
-123
lines changed

4 files changed

+619
-123
lines changed

python/sglang/srt/layers/moe/lora_moe.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,17 +104,14 @@ def _compute_lora_delta(
104104
num_loras = self.lora_a_weights.shape[0]
105105

106106
# Dispatch tokens to experts
107-
token_ids, expert_ids, _ = moe_dispatch(
107+
token_ids, expert_ids, _, lora_ids = moe_dispatch(
108108
topk_ids=topk_ids,
109109
topk_weights=topk_weights,
110110
lora_indices=lora_indices,
111111
num_experts=num_experts,
112112
num_loras=num_loras,
113113
)
114114

115-
# Get LoRA IDs for dispatched tokens
116-
lora_ids = lora_indices[token_ids]
117-
118115

119116

120117
# Compute per-expert LoRA forward (adds to base_output in-place)

python/sglang/srt/lora/lora_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def prepare_lora_batch(self, forward_batch: ForwardBatch):
285285

286286
# Populate per-token LoRA indices from segment information
287287
batch_info = self.lora_backend.batch_info
288-
num_tokens = forward_batch.batch_size
288+
num_tokens = forward_batch.seq_lens_sum # Total tokens across all sequences
289289
if batch_info.permutation is None:
290290
# No reordering (e.g., triton backend): segments are in original order
291291
token_lora_indices = torch.empty(num_tokens, dtype=torch.int32, device=batch_info.weight_indices.device)

python/sglang/srt/lora/moe_dispatch.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def moe_dispatch(
2323
lora_indices: torch.Tensor,
2424
num_experts: int,
2525
num_loras: int,
26-
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
26+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
2727
"""
2828
Dispatch tokens to experts for MoE computation.
2929
@@ -38,6 +38,7 @@ def moe_dispatch(
3838
sorted_token_ids: Token indices sorted by expert_id
3939
sorted_expert_ids: Corresponding expert IDs
4040
sorted_weights: Corresponding router weights
41+
sorted_lora_ids: LoRA adapter IDs for each dispatched token
4142
"""
4243
num_tokens, top_k = topk_ids.shape
4344
device = topk_ids.device
@@ -46,15 +47,17 @@ def moe_dispatch(
4647
flat_topk_ids = topk_ids.flatten()
4748
flat_topk_weights = topk_weights.flatten()
4849
flat_token_ids = torch.arange(num_tokens, device=device).repeat_interleave(top_k)
50+
flat_lora_ids = lora_indices.repeat_interleave(top_k)
4951

5052
# Sort by expert_id only (each expert uses same LoRA adapter logic)
51-
composite_key = flat_topk_ids
52-
53-
# Sort by expert_id to group tokens by expert
54-
sorted_indices = torch.argsort(composite_key)
53+
sorted_indices = torch.argsort(flat_topk_ids)
5554

5655
sorted_token_ids = flat_token_ids[sorted_indices]
5756
sorted_expert_ids = flat_topk_ids[sorted_indices]
5857
sorted_weights = flat_topk_weights[sorted_indices]
5958

60-
return sorted_token_ids, sorted_expert_ids, sorted_weights
59+
if flat_lora_ids.shape != sorted_indices.shape:
60+
y = 1 # need to pause
61+
sorted_lora_ids = flat_lora_ids[sorted_indices]
62+
63+
return sorted_token_ids, sorted_expert_ids, sorted_weights, sorted_lora_ids

0 commit comments

Comments (0)