Skip to content

Commit 52a1e8f

Browse files
committed
cp
1 parent aeac622 commit 52a1e8f

File tree

2 files changed

+7
-5
lines changed

2 files changed

+7
-5
lines changed

python/sglang/srt/layers/linear.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -736,7 +736,8 @@ def weight_loader_v2(

         if isinstance(param, BlockQuantScaleParameter):
             weight_block_size = self.quant_method.quant_config.weight_block_size
-            block_n, _ = weight_block_size[0], weight_block_size[1]
+            raw_block_n, _ = weight_block_size[0], weight_block_size[1]
+            block_n = 1 if getattr(param, "format_ue8m0", False) else raw_block_n
             shard_offset = (
                 (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
             ) // self.tp_size
@@ -966,7 +967,8 @@ def weight_loader_v2(

         if isinstance(param, BlockQuantScaleParameter):
             weight_block_size = self.quant_method.quant_config.weight_block_size
-            block_n, _ = weight_block_size[0], weight_block_size[1]
+            raw_block_n, _ = weight_block_size[0], weight_block_size[1]
+            block_n = 1 if getattr(param, "format_ue8m0", False) else raw_block_n
             shard_offset = (shard_offset + block_n - 1) // block_n
             shard_size = (shard_size + block_n - 1) // block_n


python/sglang/srt/layers/quantization/fp8.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,6 @@ def create_weights(
         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
         layer.orig_dtype = params_dtype
-        layer.executed_weight_requant_ue8m0 = False

         # WEIGHT
         weight_dtype = (
@@ -300,6 +299,7 @@ def create_weights(
                 output_dim=0,
                 weight_loader=weight_loader,
             )
+            scale.format_ue8m0 = False
             scale[:] = torch.finfo(torch.float32).min
             layer.register_parameter("weight_scale_inv", scale)
         else:
@@ -367,14 +367,14 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 self.w8a8_block_fp8_linear
                 is deepgemm_w8a8_block_fp8_linear_with_fallback
             )
-            and (not layer.executed_weight_requant_ue8m0)
+            and (not layer.weight_scale_inv.format_ue8m0)
         ):
             requant_weight_ue8m0_inplace(
                 layer.weight,
                 layer.weight_scale_inv,
                 self.quant_config.weight_block_size,
             )
-            layer.executed_weight_requant_ue8m0 = True
+            layer.weight_scale_inv.format_ue8m0 = True
         weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data

         layer.weight.data = weight.data

0 commit comments

Comments (0)