
Commit 13bd3a7

Merge branch 'main' into dev/sbo.public
2 parents: 0596bbf + 63b0562

File tree: 5 files changed, +33 −94 lines


docker/b300.Dockerfile

Lines changed: 0 additions & 55 deletions
This file was deleted.

python/sglang/srt/layers/moe/token_dispatcher/deepep.py

Lines changed: 1 addition & 0 deletions
@@ -316,6 +316,7 @@ def __init__(
         self.deepep_mode = deepep_mode

         self.params_bytes = 2
+        # A large value will lead to large memory occupation, thus users should change it accordingly
         self.num_max_dispatch_tokens_per_rank = get_int_env_var(
             "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK", 128
         )
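
For readers tuning this knob: the new comment documents that SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK controls the size of the DeepEP dispatch buffer (default 128, per the code above). A minimal sketch of lowering it before the dispatcher is constructed; the value 64 is only an illustrative choice, not a recommendation:

import os

# Illustrative only: cap DeepEP's per-rank dispatch buffer at 64 tokens (default is 128)
# to reduce memory occupation. It must be set before the dispatcher's __init__ runs,
# because the value is read there via get_int_env_var(...), as shown in the hunk above.
os.environ["SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK"] = "64"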

python/sglang/srt/layers/quantization/fp8.py

Lines changed: 27 additions & 4 deletions
@@ -764,22 +764,45 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     w2_weight_scale, requires_grad=False
                 )
                 layer.w2_input_scale = None
-
-            if _use_aiter:
+            elif _use_aiter:
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
                 )
                 layer.w2_weight.data = shuffle_weight(
                     layer.w2_weight.contiguous(), (16, 16)
                 )
-
-            if _is_cpu:
+            elif _is_cpu:
                 assert (
                     _is_cpu_amx_available
                 ), "Fp8MoEMethod on CPU requires that CPU has AMX support"
                 _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+            else:
+                # For fp8 moe run with deepgemm, the expert weights and scales need be requantized to ue8m0
+                from sglang.srt.layers.moe import get_moe_runner_backend
+                from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE
+                from sglang.srt.model_loader.utils import (
+                    should_deepgemm_weight_requant_ue8m0,
+                )

+                if (
+                    should_deepgemm_weight_requant_ue8m0(
+                        weight_block_size=getattr(
+                            self.quant_config, "weight_block_size", None
+                        ),
+                    )
+                    and get_moe_runner_backend().is_deep_gemm()
+                ):
+                    assert isinstance(
+                        layer, DeepEPMoE
+                    ), "DeepGemm MoE is only supported with DeepEPMoE"
+                    weight_block_size = self.quant_config.weight_block_size
+                    requant_weight_ue8m0_inplace(
+                        layer.w13_weight, layer.w13_weight_scale_inv, weight_block_size
+                    )
+                    requant_weight_ue8m0_inplace(
+                        layer.w2_weight, layer.w2_weight_scale_inv, weight_block_size
+                    )
             return

         # If checkpoint is fp16 or bfloat16, quantize in place.
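
Background on the new else branch: DeepGEMM consumes block scale factors in UE8M0, an exponent-only encoding, so scales must be powers of two. The snippet below is only a toy illustration of that idea (round each scale up to a power of two and rescale the corresponding weights so the dequantized product is preserved); it is not the actual requant_weight_ue8m0_inplace implementation, and the tensors are assumed to be plain floats for simplicity:

import torch

def toy_requant_to_pow2_scales(weight, scale_inv):
    # Illustrative sketch only; the real code requantizes fp8 weights in place.
    # Round each block scale up to the nearest power of two (UE8M0-style).
    pow2_scale = torch.exp2(torch.ceil(torch.log2(scale_inv)))
    # Rescale the weights so that weight * scale stays (approximately) unchanged.
    weight = weight * (scale_inv / pow2_scale)
    return weight, pow2_scale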

python/sglang/srt/models/deepseek_v2.py

Lines changed: 0 additions & 35 deletions
@@ -114,7 +114,6 @@
     inverse_transform_scale_ue8m0,
     normalize_e4m3fn_to_e4m3fnuz,
     quant_weight_ue8m0,
-    requant_weight_ue8m0_inplace,
     transform_scale_ue8m0_inplace,
 )
 from sglang.srt.layers.quantization.int8_utils import (

@@ -3738,43 +3737,9 @@ def post_load_weights(self, is_nextn=False, weight_names=None):
             self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
             self_attn.use_deep_gemm_bmm = True

-        # Requant the weights and scales of MoE layers
-        if get_moe_runner_backend().is_deep_gemm():
-            self._maybe_moe_weight_requant_ue8m0(is_nextn)
         if is_nextn and enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
             self._transform_scale_nextn_moe_ue8m0()

-    def _maybe_moe_weight_requant_ue8m0(self, is_nextn=False):
-        # Dense fp8 layers will be processed in Fp8LinearMethod.process_weights_after_loading
-        # So we only need to process sparse MoE layers here
-        weight_block_size = self.quant_config.weight_block_size
-
-        moe_layers = list(
-            range(
-                self.config.first_k_dense_replace,
-                self.config.num_hidden_layers,
-                self.config.moe_layer_freq,
-            )
-        )
-
-        num_hidden_layers = 1 if is_nextn else self.config.num_hidden_layers
-
-        for layer_id in range(num_hidden_layers):
-            if is_nextn:
-                layer = self.model.decoder
-            else:
-                layer = self.model.layers[layer_id]
-
-            if layer_id in moe_layers or is_nextn:
-                experts = layer.mlp.experts
-                # TODO: move this logic to Fp8MoEMethod.process_weights_after_loading
-                if isinstance(experts, DeepEPMoE):
-                    for w in [
-                        (experts.w13_weight, experts.w13_weight_scale_inv),
-                        (experts.w2_weight, experts.w2_weight_scale_inv),
-                    ]:
-                        requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
-
     # TODO avoid code dup (currently combine from weight_requant_ue8m0 and transform_scale_ue8m0)
     def _transform_scale_nextn_moe_ue8m0(self):
         layer = self.model.decoder
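
The removed helper selected MoE layers with range(first_k_dense_replace, num_hidden_layers, moe_layer_freq); its per-layer loop is no longer needed because the requant now runs inside Fp8MoEMethod.process_weights_after_loading for each MoE layer. As a quick illustration of what the old expression computed, with hypothetical config values:

# Hypothetical config values, chosen only to illustrate the removed expression.
first_k_dense_replace = 1   # the first layer stays dense
num_hidden_layers = 27
moe_layer_freq = 1          # every layer after that is an MoE layer

moe_layers = list(range(first_k_dense_replace, num_hidden_layers, moe_layer_freq))
# -> [1, 2, ..., 26]: every layer except layer 0 holds the sparse experts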

python/sglang/srt/server_args.py

Lines changed: 5 additions & 0 deletions
@@ -1517,6 +1517,11 @@ def _handle_moe_kernel_config(self):
                 self.ep_size == 1
             ), "FP8 Cutlass MoE is only supported with ep_size == 1"

+        if self.moe_runner_backend == "deep_gemm":
+            assert (
+                self.ep_size > 1 and self.moe_a2a_backend == "deepep"
+            ), "DeepGemm MoE runner is only supported when ep is enabled and moe_a2a_backend is deepep"
+
     def _handle_a2a_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":