
Commit 4ce70e6

NPU Graph Compilation support and PassManager with AddRmsNorm & Quantize fusion
1 parent 6237754 commit 4ce70e6

File tree

19 files changed, +700 -167 lines


python/sglang/srt/_custom_ops.py

Lines changed: 41 additions & 1 deletion
@@ -4,13 +4,53 @@
 
 import torch
 
-from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu, is_npu
+from sglang.srt.utils import (
+    direct_register_custom_op,
+    get_bool_env_var,
+    get_cmo_stream,
+    is_hip,
+    is_hpu,
+    is_npu,
+)
 
 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = get_bool_env_var(
     "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
 )
 
+
+import sglang.srt.utils
+
+
+@torch.library.custom_op("sglang::wait_cmo_stream", mutates_args=())
+def wait_cmo_stream() -> None:
+    if is_npu() and get_cmo_stream():
+        sglang.srt.utils.wait_cmo_stream()
+
+
+@wait_cmo_stream.register_fake
+def wait_cmo_stream_fake() -> None:
+    pass
+
+
+def prepare_weight_cache(handle: torch.Tensor, cache: List[torch.Tensor]) -> None:
+    sglang.srt.utils.prepare_weight_cache(handle, cache)
+
+
+def prepare_weight_cache_register_fake(
+    handle: torch.Tensor, cache: List[torch.Tensor]
+) -> None:
+    pass
+
+
+direct_register_custom_op(
+    op_name="prepare_weight_cache",
+    op_func=prepare_weight_cache,
+    mutates_args=["handle"],
+    fake_impl=prepare_weight_cache_register_fake,
+)
+
+
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
     if use_vllm_custom_allreduce and not is_hip():
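
The block above registers NPU-side helpers as torch custom ops with fake (meta) implementations, which is what lets graph compilation trace through them. A rough, self-contained sketch of the same pattern (generic names, not SGLang's helpers; assumes PyTorch >= 2.4 for torch.library.custom_op):

import torch

# Register a custom op; the real implementation may carry a device-side effect.
@torch.library.custom_op("demo::sync_side_stream", mutates_args=())
def sync_side_stream() -> None:
    # The real implementation would, e.g., wait on a secondary stream here.
    pass

# The fake implementation stands in for the op while the compiler traces the graph.
@sync_side_stream.register_fake
def sync_side_stream_fake() -> None:
    pass

@torch.compile
def step(x: torch.Tensor) -> torch.Tensor:
    torch.ops.demo.sync_side_stream()  # captured as an opaque node in the graph
    return x + 1

print(step(torch.zeros(4)))

The same reasoning presumably applies to prepare_weight_cache: direct_register_custom_op wires up a fake implementation (as shown in the diff) so the cache-priming call does not break graph capture.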

python/sglang/srt/configs/model_config.py

Lines changed: 3 additions & 0 deletions
@@ -97,6 +97,7 @@ def __init__(
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
         sampling_defaults: str = "openai",
         quantize_and_serve: bool = False,
+        enable_torch_compile: bool = False,
     ) -> None:
         # Parse args
         self.model_path = model_path
@@ -106,6 +107,7 @@ def __init__(
         self.model_impl = model_impl
         self.sampling_defaults = sampling_defaults
         self.quantize_and_serve = quantize_and_serve
+        self.enable_torch_compile = enable_torch_compile
 
         # Validate quantize_and_serve configuration
         self._validate_quantize_and_serve_config()
@@ -234,6 +236,7 @@ def from_server_args(
             model_impl=server_args.model_impl,
             sampling_defaults=server_args.sampling_defaults,
             quantize_and_serve=server_args.quantize_and_serve,
+            enable_torch_compile=server_args.enable_torch_compile,
             **kwargs,
         )
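
The new enable_torch_compile field simply mirrors the server argument of the same name into ModelConfig; judging from the ascend_backend.py change below, it is presumably what lets the Ascend attention backend select the graph-compilation-friendly decode path.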

python/sglang/srt/layers/attention/ascend_backend.py

Lines changed: 136 additions & 96 deletions
@@ -74,6 +74,7 @@ def update_verify_buffers_to_fill_after_draft(
 
     def __init__(self, model_runner: ModelRunner):
         super().__init__()
+        self.enable_torch_compile = False
        self.forward_metadata = None
         self.device = model_runner.device
         self.page_size = model_runner.page_size
@@ -576,112 +577,151 @@ def forward_decode_graph(
                 layer, forward_batch.out_cache_loc, k, v
             )
 
-        if not self.use_mla:
-            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
-                layer.layer_id
-            ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
-            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
-                layer.layer_id
-            ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
-            query = q.reshape(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
-            if self.forward_metadata.seq_lens_cpu_int is None:
-                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
-            else:
-                actual_seq_len_kv = (
-                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
-                )
+        if not self.use_mla and self.enable_torch_compile:
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            v_cache = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+            query = q.reshape(-1, layer.tp_q_head_num, layer.qk_head_dim)
             num_tokens = query.shape[0]
-            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
-                query,
-                k_cache,
-                v_cache,
-                block_table=self.forward_metadata.block_tables,
-                block_size=self.page_size,
-                num_heads=layer.tp_q_head_num,
-                num_key_value_heads=layer.tp_k_head_num,
-                input_layout="BSH",
-                scale=layer.scaling,
-                actual_seq_lengths_kv=actual_seq_len_kv,
-            )
-            output = torch.empty(
-                (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
-                dtype=q.dtype,
-                device=q.device,
-            )
-            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
-            torch_npu.npu_fused_infer_attention_score.out(
-                query,
-                k_cache,
-                v_cache,
-                block_table=self.forward_metadata.block_tables,
-                block_size=self.page_size,
-                num_heads=layer.tp_q_head_num,
-                num_key_value_heads=layer.tp_k_head_num,
-                input_layout="BSH",
-                scale=layer.scaling,
-                actual_seq_lengths_kv=actual_seq_len_kv,
-                workspace=workspace,
-                out=[output, softmax_lse],
-            )
-            return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
-        else:
-            c_kv, k_rope = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
-            k_rope_cache = k_rope.view(
-                -1, layer.tp_k_head_num, self.page_size, self.qk_rope_head_dim
-            )
-            c_kv_cache = c_kv.view(
-                -1, layer.tp_v_head_num, self.page_size, self.kv_lora_rank
+            attn_output = torch.empty(
+                (num_tokens, layer.tp_q_head_num, layer.v_head_dim),
+                dtype=query.dtype,
+                device=query.device,
             )
 
-            q_nope = q.view(-1, layer.tp_q_head_num, 1, self.kv_lora_rank).contiguous()
-            q_rope = q_rope.view(-1, layer.tp_q_head_num, 1, self.qk_rope_head_dim)
             if self.forward_metadata.seq_lens_cpu_int is None:
-                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
-            else:
-                actual_seq_len_kv = (
-                    self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
+                actual_seq_len_kv = torch.from_numpy(
+                    np.array(self.forward_metadata.seq_lens_cpu_list).astype(np.int32)
                 )
+            else:
+                actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_int
 
-            workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
-                q_nope,
-                c_kv_cache,
-                c_kv_cache,
-                query_rope=q_rope,
-                key_rope=k_rope_cache,
+            torch_npu._npu_paged_attention(
+                query=query,
+                key_cache=k_cache,
+                value_cache=v_cache,
                 num_heads=layer.tp_q_head_num,
-                num_key_value_heads=layer.tp_k_head_num,
+                num_kv_heads=layer.tp_k_head_num,
+                scale_value=layer.scaling,
                 block_table=self.forward_metadata.block_tables,
-                block_size=self.page_size,
-                input_layout="BNSD",
-                scale=layer.scaling,
-                actual_seq_lengths_kv=actual_seq_len_kv,
-                antiquant_mode=0,
-                antiquant_scale=None,
-                sparse_mode=0,
+                context_lens=actual_seq_len_kv,
+                out=attn_output,
             )
-            output = torch.empty_like(q_nope, dtype=q.dtype, device=q.device)
-            softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+            return attn_output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
+        else:
+            if not self.use_mla:
+                k_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_k_head_num * layer.qk_head_dim)
+                v_cache = forward_batch.token_to_kv_pool.get_value_buffer(
+                    layer.layer_id
+                ).view(-1, self.page_size, layer.tp_v_head_num * layer.v_head_dim)
+                query = q.reshape(-1, 1, layer.tp_q_head_num * layer.qk_head_dim)
+                if self.forward_metadata.seq_lens_cpu_int is None:
+                    actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
+                else:
+                    actual_seq_len_kv = (
+                        self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
+                    )
+                num_tokens = query.shape[0]
+                workspace = (
+                    torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                        query,
+                        k_cache,
+                        v_cache,
+                        block_table=self.forward_metadata.block_tables,
+                        block_size=self.page_size,
+                        num_heads=layer.tp_q_head_num,
+                        num_key_value_heads=layer.tp_k_head_num,
+                        input_layout="BSH",
+                        scale=layer.scaling,
+                        actual_seq_lengths_kv=actual_seq_len_kv,
+                    )
+                )
+                output = torch.empty(
+                    (num_tokens, 1, layer.tp_q_head_num * layer.v_head_dim),
+                    dtype=q.dtype,
+                    device=q.device,
+                )
+                softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+                torch_npu.npu_fused_infer_attention_score.out(
+                    query,
+                    k_cache,
+                    v_cache,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    input_layout="BSH",
+                    scale=layer.scaling,
+                    actual_seq_lengths_kv=actual_seq_len_kv,
+                    workspace=workspace,
+                    out=[output, softmax_lse],
+                )
+                return output.view(num_tokens, layer.tp_q_head_num * layer.v_head_dim)
+            else:
+                c_kv, k_rope = forward_batch.token_to_kv_pool.get_kv_buffer(
+                    layer.layer_id
+                )
+                k_rope_cache = k_rope.view(
+                    -1, layer.tp_k_head_num, self.page_size, self.qk_rope_head_dim
+                )
+                c_kv_cache = c_kv.view(
+                    -1, layer.tp_v_head_num, self.page_size, self.kv_lora_rank
+                )
 
-            torch_npu.npu_fused_infer_attention_score.out(
-                q_nope,
-                c_kv_cache,
-                c_kv_cache,
-                query_rope=q_rope,
-                key_rope=k_rope_cache,
-                num_heads=layer.tp_q_head_num,
-                num_key_value_heads=layer.tp_k_head_num,
-                block_table=self.forward_metadata.block_tables,
-                block_size=self.page_size,
-                input_layout="BNSD",
-                scale=layer.scaling,
-                actual_seq_lengths_kv=actual_seq_len_kv,
-                antiquant_mode=0,
-                antiquant_scale=None,
-                sparse_mode=0,
-                workspace=workspace,
-                out=[output, softmax_lse],
-            )
-            return output.view(-1, layer.tp_q_head_num * self.kv_lora_rank)
+                q_nope = q.view(
+                    -1, layer.tp_q_head_num, 1, self.kv_lora_rank
+                ).contiguous()
+                q_rope = q_rope.view(-1, layer.tp_q_head_num, 1, self.qk_rope_head_dim)
+                if self.forward_metadata.seq_lens_cpu_int is None:
+                    actual_seq_len_kv = self.forward_metadata.seq_lens_cpu_list
+                else:
+                    actual_seq_len_kv = (
+                        self.forward_metadata.seq_lens_cpu_int.cpu().int().tolist()
+                    )
+
+                workspace = (
+                    torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                        q_nope,
+                        c_kv_cache,
+                        c_kv_cache,
+                        query_rope=q_rope,
+                        key_rope=k_rope_cache,
+                        num_heads=layer.tp_q_head_num,
+                        num_key_value_heads=layer.tp_k_head_num,
+                        block_table=self.forward_metadata.block_tables,
+                        block_size=self.page_size,
+                        input_layout="BNSD",
+                        scale=layer.scaling,
+                        actual_seq_lengths_kv=actual_seq_len_kv,
+                        antiquant_mode=0,
+                        antiquant_scale=None,
+                        sparse_mode=0,
+                    )
+                )
+                output = torch.empty_like(q_nope, dtype=q.dtype, device=q.device)
+                softmax_lse = torch.empty(1, dtype=q.dtype, device=q.device)
+
+                torch_npu.npu_fused_infer_attention_score.out(
+                    q_nope,
+                    c_kv_cache,
+                    c_kv_cache,
+                    query_rope=q_rope,
+                    key_rope=k_rope_cache,
+                    num_heads=layer.tp_q_head_num,
+                    num_key_value_heads=layer.tp_k_head_num,
+                    block_table=self.forward_metadata.block_tables,
+                    block_size=self.page_size,
+                    input_layout="BNSD",
+                    scale=layer.scaling,
+                    actual_seq_lengths_kv=actual_seq_len_kv,
+                    antiquant_mode=0,
+                    antiquant_scale=None,
+                    sparse_mode=0,
+                    workspace=workspace,
+                    out=[output, softmax_lse],
+                )
+                return output.view(-1, layer.tp_q_head_num * self.kv_lora_rank)
 
     def forward_decode(
         self,
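
In the new torch-compile branch above, the per-request KV lengths stay on the tensor side (an int32 tensor passed as context_lens to _npu_paged_attention) instead of being materialized with .cpu().int().tolist() as in the fallback path. A rough torch.compile illustration of why that matters for graph capture (generic example, not NPU code; exact behaviour depends on Dynamo settings):

import torch

def lengths_as_list(seq_lens: torch.Tensor) -> torch.Tensor:
    # Host round-trip: .tolist() typically forces a graph break under torch.compile
    # (unless scalar-output capture is enabled).
    lens = seq_lens.cpu().int().tolist()
    return torch.tensor(lens, dtype=torch.int32)

def lengths_as_tensor(seq_lens: torch.Tensor) -> torch.Tensor:
    # Stays on-graph: no host synchronization is needed while tracing.
    return seq_lens.to(torch.int32)

compiled = torch.compile(lengths_as_tensor, fullgraph=True)
print(compiled(torch.tensor([3, 5, 7])))
# torch.compile(lengths_as_list, fullgraph=True) would usually fail to capture a
# single graph because of the .tolist() call.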

python/sglang/srt/layers/communicator.py

Lines changed: 2 additions & 3 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 SGLang Team
+# Copyright 2023-2025 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -51,7 +51,6 @@
     is_hip,
     is_sm90_supported,
     is_sm100_supported,
-    prepare_weight_cache,
 )
 
 _is_flashinfer_available = is_flashinfer_available()
@@ -567,7 +566,7 @@ def _gather_hidden_states_and_residual(
         else:
             hidden_states = tensor_model_parallel_all_reduce(hidden_states)
         if context.cache is not None:
-            _ = prepare_weight_cache(hidden_states, context.cache)
+            torch.ops.sglang.prepare_weight_cache(hidden_states, context.cache)
         hidden_states, residual = layernorm(hidden_states, residual)
         return hidden_states, residual
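
Routing the call through torch.ops.sglang.prepare_weight_cache (instead of calling the Python helper imported from sglang.srt.utils directly) dispatches it as the custom op registered in _custom_ops.py above, so its fake implementation can presumably stand in for it when this module is traced for graph compilation.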
