
Commit 13bd3a7

Merge branch 'main' into dev/sbo.public
2 parents: 0596bbf + 63b0562

File tree: 5 files changed, +33 −94 lines


docker/b300.Dockerfile

Lines changed: 0 additions & 55 deletions
This file was deleted.

python/sglang/srt/layers/moe/token_dispatcher/deepep.py

Lines changed: 1 addition & 0 deletions
@@ -316,6 +316,7 @@ def __init__(
         self.deepep_mode = deepep_mode

         self.params_bytes = 2
+        # A large value will lead to large memory occupation, thus users should change it accordingly
         self.num_max_dispatch_tokens_per_rank = get_int_env_var(
             "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK", 128
         )
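
For readers tuning this knob: the new comment documents that SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK controls the size of the DeepEP dispatch buffer (default 128, per the code above). A minimal sketch of lowering it before the dispatcher is constructed; the value 64 is only an illustrative choice, not a recommendation:

import os

# Illustrative only: cap DeepEP's per-rank dispatch buffer at 64 tokens (default is 128)
# to reduce memory occupation. It must be set before the dispatcher's __init__ runs,
# because the value is read there via get_int_env_var(...), as shown in the hunk above.
os.environ["SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK"] = "64"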

python/sglang/srt/layers/quantization/fp8.py

Lines changed: 27 additions & 4 deletions
@@ -764,22 +764,45 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     w2_weight_scale, requires_grad=False
                 )
                 layer.w2_input_scale = None
-
-            if _use_aiter:
+            elif _use_aiter:
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
                 )
                 layer.w2_weight.data = shuffle_weight(
                     layer.w2_weight.contiguous(), (16, 16)
                 )
-
-            if _is_cpu:
+            elif _is_cpu:
                 assert (
                     _is_cpu_amx_available
                 ), "Fp8MoEMethod on CPU requires that CPU has AMX support"
                 _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+            else:
+                # For fp8 moe run with deepgemm, the expert weights and scales need be requantized to ue8m0
+                from sglang.srt.layers.moe import get_moe_runner_backend
+                from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE
+                from sglang.srt.model_loader.utils import (
+                    should_deepgemm_weight_requant_ue8m0,
+                )

+                if (
+                    should_deepgemm_weight_requant_ue8m0(
+                        weight_block_size=getattr(
+                            self.quant_config, "weight_block_size", None
+                        ),
+                    )
+                    and get_moe_runner_backend().is_deep_gemm()
+                ):
+                    assert isinstance(
+                        layer, DeepEPMoE
+                    ), "DeepGemm MoE is only supported with DeepEPMoE"
+                    weight_block_size = self.quant_config.weight_block_size
+                    requant_weight_ue8m0_inplace(
+                        layer.w13_weight, layer.w13_weight_scale_inv, weight_block_size
+                    )
+                    requant_weight_ue8m0_inplace(
+                        layer.w2_weight, layer.w2_weight_scale_inv, weight_block_size
+                    )
             return

         # If checkpoint is fp16 or bfloat16, quantize in place.
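
Background on the new else branch: DeepGEMM consumes block scale factors in UE8M0, an exponent-only encoding, so scales must be powers of two. The snippet below is only a toy illustration of that idea (round each scale up to a power of two and rescale the corresponding weights so the dequantized product is preserved); it is not the actual requant_weight_ue8m0_inplace implementation, and the tensors are assumed to be plain floats for simplicity:

import torch

def toy_requant_to_pow2_scales(weight, scale_inv):
    # Illustrative sketch only; the real code requantizes fp8 weights in place.
    # Round each block scale up to the nearest power of two (UE8M0-style).
    pow2_scale = torch.exp2(torch.ceil(torch.log2(scale_inv)))
    # Rescale the weights so that weight * scale stays (approximately) unchanged.
    weight = weight * (scale_inv / pow2_scale)
    return weight, pow2_scale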

python/sglang/srt/models/deepseek_v2.py

Lines changed: 0 additions & 35 deletions
@@ -114,7 +114,6 @@
     inverse_transform_scale_ue8m0,
     normalize_e4m3fn_to_e4m3fnuz,
     quant_weight_ue8m0,
-    requant_weight_ue8m0_inplace,
     transform_scale_ue8m0_inplace,
 )
 from sglang.srt.layers.quantization.int8_utils import (

@@ -3738,43 +3737,9 @@ def post_load_weights(self, is_nextn=False, weight_names=None):
             self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
             self_attn.use_deep_gemm_bmm = True

-        # Requant the weights and scales of MoE layers
-        if get_moe_runner_backend().is_deep_gemm():
-            self._maybe_moe_weight_requant_ue8m0(is_nextn)
         if is_nextn and enable_nextn_moe_bf16_cast_to_fp8(self.quant_config):
             self._transform_scale_nextn_moe_ue8m0()

-    def _maybe_moe_weight_requant_ue8m0(self, is_nextn=False):
-        # Dense fp8 layers will be processed in Fp8LinearMethod.process_weights_after_loading
-        # So we only need to process sparse MoE layers here
-        weight_block_size = self.quant_config.weight_block_size
-
-        moe_layers = list(
-            range(
-                self.config.first_k_dense_replace,
-                self.config.num_hidden_layers,
-                self.config.moe_layer_freq,
-            )
-        )
-
-        num_hidden_layers = 1 if is_nextn else self.config.num_hidden_layers
-
-        for layer_id in range(num_hidden_layers):
-            if is_nextn:
-                layer = self.model.decoder
-            else:
-                layer = self.model.layers[layer_id]
-
-            if layer_id in moe_layers or is_nextn:
-                experts = layer.mlp.experts
-                # TODO: move this logic to Fp8MoEMethod.process_weights_after_loading
-                if isinstance(experts, DeepEPMoE):
-                    for w in [
-                        (experts.w13_weight, experts.w13_weight_scale_inv),
-                        (experts.w2_weight, experts.w2_weight_scale_inv),
-                    ]:
-                        requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
-
     # TODO avoid code dup (currently combine from weight_requant_ue8m0 and transform_scale_ue8m0)
     def _transform_scale_nextn_moe_ue8m0(self):
         layer = self.model.decoder
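
The removed helper selected MoE layers with range(first_k_dense_replace, num_hidden_layers, moe_layer_freq); its per-layer loop is no longer needed because the requant now runs inside Fp8MoEMethod.process_weights_after_loading for each MoE layer. As a quick illustration of what the old expression computed, with hypothetical config values:

# Hypothetical config values, chosen only to illustrate the removed expression.
first_k_dense_replace = 1   # the first layer stays dense
num_hidden_layers = 27
moe_layer_freq = 1          # every layer after that is an MoE layer

moe_layers = list(range(first_k_dense_replace, num_hidden_layers, moe_layer_freq))
# -> [1, 2, ..., 26]: every layer except layer 0 holds the sparse experts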

python/sglang/srt/server_args.py

Lines changed: 5 additions & 0 deletions
@@ -1517,6 +1517,11 @@ def _handle_moe_kernel_config(self):
                 self.ep_size == 1
             ), "FP8 Cutlass MoE is only supported with ep_size == 1"

+        if self.moe_runner_backend == "deep_gemm":
+            assert (
+                self.ep_size > 1 and self.moe_a2a_backend == "deepep"
+            ), "DeepGemm MoE runner is only supported when ep is enabled and moe_a2a_backend is deepep"
+
     def _handle_a2a_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":