
Commit d30dc51

jianglan89 authored and Nancheng-11 committed
fix - fp8 per block load quant need skip [W.mla_kc, W.mla_vc]
1 parent: 32db79d

1 file changed: +6 additions, −4 deletions


rtp_llm/model_loader/per_block_fp8_quant_weight.py (6 additions, 4 deletions)

```diff
@@ -8,13 +8,13 @@
 from rtp_llm.model_loader.attn_weight import AttnAtomicWeight, MlaAttnAtomicWeight
 from rtp_llm.model_loader.ffn_weight import FfnAtomicWeight, MoeAtomicWeight
 from rtp_llm.model_loader.load_config import LoadConfig
+from rtp_llm.model_loader.tensor_source import TensorSource
 from rtp_llm.model_loader.weight_module import (
     AtomicWeight,
     CompositeWeight,
     QuantWeight,
     WeightModule,
 )
-from rtp_llm.model_loader.tensor_source import TensorSource
 from rtp_llm.utils.model_weight import (
     FP8_E4M3_MAX,
     CkptWeightInfo,
@@ -710,7 +710,7 @@ def support(
         ):
             return False
         name = src_weight_info.name
-        return name in cls.w8a8_weight_list
+        return name in cls.w8a8_weight_list and name not in [W.mla_kc, W.mla_vc]
 
     def __init__(
         self,
@@ -750,7 +750,9 @@ def _load_raw_tensor(
         device: str,
         load_config: LoadConfig,
     ):
-        kernel = self.kernel._load_raw_tensor(tensor_source, layer_id, device, load_config)
+        kernel = self.kernel._load_raw_tensor(
+            tensor_source, layer_id, device, load_config
+        )
 
         res = {}
         scale = None
@@ -774,7 +776,7 @@ def _load_raw_tensor(
         res.update({self.scale.name: scale.contiguous().to(device)})
 
         return res
-
+
     def get_tensor_names(
         self, layer_id: Optional[int], load_config: LoadConfig
     ) -> set[str]:
```
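The behavioral change is the support() gate: per-block FP8 (w8a8) quantization is now refused for the MLA kc/vc weights even though they appear in w8a8_weight_list, so those tensors are loaded without per-block quantization. Below is a minimal runnable sketch of the fixed predicate, using hypothetical stand-ins for cls.w8a8_weight_list and the W.mla_kc / W.mla_vc name constants (their real values live in rtp_llm.utils.model_weight and are not shown in this diff):

```python
# Hypothetical stand-ins; the real check is a classmethod on the
# per-block FP8 quant weight class in per_block_fp8_quant_weight.py.
MLA_KC = "mla_kc"  # assumed string value of W.mla_kc
MLA_VC = "mla_vc"  # assumed string value of W.mla_vc
W8A8_WEIGHT_LIST = {"attn_qkv_w", "attn_o_w", MLA_KC, MLA_VC}  # illustrative subset


def supports_per_block_fp8(name: str) -> bool:
    """Mirror the fixed support() check: a weight is per-block FP8
    quantized only if it is w8a8-eligible AND not an MLA kc/vc weight."""
    return name in W8A8_WEIGHT_LIST and name not in (MLA_KC, MLA_VC)


assert supports_per_block_fp8("attn_o_w")
assert not supports_per_block_fp8(MLA_KC)  # skipped after this commit
```

The other two hunks are mechanical: a line wrap in _load_raw_tensor and moving the TensorSource import above the weight_module import block.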

0 commit comments

Comments
 (0)
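For background on what this file loads: per-block FP8 quantization stores an FP8 kernel plus one scale per weight tile, which is what _load_raw_tensor collects into res under the kernel name and self.scale.name. Below is a rough, self-contained sketch of such a scheme, assuming a 128×128 tile size and torch's float8_e4m3fn dtype (whose largest finite value, 448.0, matches the FP8_E4M3_MAX constant imported here); it illustrates the general technique, not the loader's actual code:

```python
import torch

FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value
BLOCK = 128           # assumed tile size; common in per-block FP8 schemes


def per_block_fp8_quant(w: torch.Tensor, block: int = BLOCK):
    """Quantize a 2-D weight with one float32 scale per (block x block) tile."""
    n, k = w.shape
    nb, kb = (n + block - 1) // block, (k + block - 1) // block
    # Pad up to whole tiles, then view as (nb, block, kb, block) to reduce per tile.
    padded = torch.zeros(nb * block, kb * block, dtype=w.dtype)
    padded[:n, :k] = w
    tiles = padded.view(nb, block, kb, block)
    amax = tiles.abs().amax(dim=(1, 3)).clamp(min=1e-12)
    scale = (amax / FP8_E4M3_MAX).to(torch.float32)
    # Scale each tile into FP8 range and cast down; keep the original shape.
    q = (tiles / scale[:, None, :, None]).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX)
    kernel = q.reshape(nb * block, kb * block)[:n, :k].to(torch.float8_e4m3fn)
    return kernel, scale


# Round trip: dequantize and check the error is small.
w = torch.randn(256, 512)
kernel, scale = per_block_fp8_quant(w)
deq = kernel.to(torch.float32) * scale.repeat_interleave(BLOCK, 0).repeat_interleave(BLOCK, 1)
print((deq - w).abs().max())  # small per-tile quantization error
```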