sgl-project · eshoguli · Sep 17, 2025 · Oct 31, 2025 · Nov 1, 2025 · Nov 5, 2025
diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
@@ -4,11 +4,47 @@
 
 import torch
 
-from sglang.srt.utils import is_hip, is_hpu, is_npu
+from sglang.srt.utils import direct_register_custom_op, is_hip, is_hpu, is_npu
 
 logger = logging.getLogger(__name__)
 
 
+import sglang.srt.utils
+
+
+@torch.library.custom_op("sglang::wait_cmo_stream", mutates_args=())
+def wait_cmo_stream() -> None:
+    """Custom op that waits on the CMO stream when one is reported.
+
+    Delegates to ``sglang.srt.utils.wait_cmo_stream`` only when
+    ``sglang.srt.utils.get_cmo_stream()`` returns a truthy value; otherwise
+    it does nothing.
+    """
+    utils = sglang.srt.utils
+    if not utils.get_cmo_stream():
+        return
+    utils.wait_cmo_stream()
+
+
+@wait_cmo_stream.register_fake
+def wait_cmo_stream_fake() -> None:
+    """Fake implementation of ``wait_cmo_stream``: performs no work."""
+    return None
+
+
+def get_cmo_stream() -> bool:
+    """Return ``True`` unconditionally.
+
+    NOTE(review): this module-level function is distinct from
+    ``sglang.srt.utils.get_cmo_stream``, which is what ``wait_cmo_stream``
+    in this file actually consults -- confirm which callers rely on this one.
+    """
+    return True
+
+
+def prepare_weight_cache(handle: torch.Tensor, cache: List[torch.Tensor]) -> None:
+    """Forward ``handle`` and ``cache`` to ``sglang.srt.utils.prepare_weight_cache``.
+
+    This thin wrapper is the function registered below as the
+    ``prepare_weight_cache`` custom op.
+    """
+    delegate = sglang.srt.utils.prepare_weight_cache
+    delegate(handle, cache)
+
+
+def prepare_weight_cache_register_fake(
+    handle: torch.Tensor, cache: List[torch.Tensor]
+) -> None:
+    """Fake implementation of ``prepare_weight_cache``: ignores its arguments."""
+    return None
+
+
+# Register ``prepare_weight_cache`` as a custom op through
+# ``direct_register_custom_op``, pairing it with
+# ``prepare_weight_cache_register_fake`` as the fake implementation.
+# ``handle`` is declared as the only argument the op mutates.
+direct_register_custom_op(
+    op_name="prepare_weight_cache",
+    op_func=prepare_weight_cache,
+    mutates_args=["handle"],
+    fake_impl=prepare_weight_cache_register_fake,
+)
+
+
 if not is_hpu():
     try:
         import sgl_kernel

@@ -1,14 +1,23 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/compilation_config.py
 
-from typing import List
+import json
+from typing import List, Optional
 
 
 # TODO(Yuwei): support better compile config support
 class CompilationConfig:
-    def __init__(self, capture_sizes: List[int], compiler: str = "eager"):
+    splitting_ops: Optional[list[str]] = None
+
+    def __init__(
+        self,
+        capture_sizes: Optional[List[int]] = None,
+        compiler: str = "eager",
+        splitting_ops: Optional[list[str]] = None,
+    ):
+        """Create a compilation config.
+
+        Args:
+            capture_sizes: Sizes returned by ``get_capture_sizes``. ``None``
+                (the default) yields a fresh empty list.
+            compiler: Name of the compiler to use; defaults to ``"eager"``.
+            splitting_ops: Stored on ``self.splitting_ops``. ``None`` (the
+                default) yields a fresh empty list.
+        """
         self.traced_files = set()
-        self.capture_sizes = capture_sizes
+        # Build the empty default per instance: a literal ``[]`` default in
+        # the signature would be one list shared by every instance.
+        self.capture_sizes = [] if capture_sizes is None else capture_sizes
         self.compiler = compiler
+        self.splitting_ops = [] if splitting_ops is None else splitting_ops
 
     def add_traced_file(self, file_path: str):
+        """Add ``file_path`` to the ``traced_files`` set."""
         self.traced_files.add(file_path)
@@ -18,3 +27,8 @@ def get_traced_files(self):
 
     def get_capture_sizes(self):
+        """Return the ``capture_sizes`` value stored by ``__init__``."""
         return self.capture_sizes
+
+    @classmethod
+    def from_cli(cls, args) -> "CompilationConfig":
+        """Build a config from a JSON object string given on the command line.
+
+        Args:
+            args: JSON text whose top-level value is an object; its keys are
+                passed to the constructor as keyword arguments.
+
+        Returns:
+            An instance of ``cls`` (so subclasses construct themselves).
+
+        Raises:
+            json.JSONDecodeError: If ``args`` is not valid JSON.
+            TypeError: If the JSON value is not an object, or contains keys
+                the constructor does not accept.
+        """
+        args_dict = json.loads(args)
+        if not isinstance(args_dict, dict):
+            raise TypeError(
+                "compilation config must be a JSON object, "
+                f"got {type(args_dict).__name__}"
+            )
+        return cls(**args_dict)
@@ -0,0 +1,52 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import List, Optional
+
+import torch
+
+import sglang.srt.layers.dp_attention
+
+
+@torch.library.custom_op("sglang::_set_dp_buffer_len", mutates_args=())
+def _set_dp_buffer_len(
+    global_dp_buffer_len: Optional[int],
+    num_tokens: Optional[int],
+    is_max_len: bool,
+    global_num_tokens: Optional[List[int]] = None,
+) -> None:
+    """Custom-op wrapper around ``dp_attention.set_dp_buffer_len``.
+
+    All four arguments are forwarded unchanged, in the same order, to
+    ``sglang.srt.layers.dp_attention.set_dp_buffer_len``.
+    """
+    # The former ``global set_dp_buffer_len_original`` declaration was dropped:
+    # that name is never defined or used in this function.
+    sglang.srt.layers.dp_attention.set_dp_buffer_len(
+        global_dp_buffer_len, num_tokens, is_max_len, global_num_tokens
+    )
+
+
+@_set_dp_buffer_len.register_fake
+def _set_dp_buffer_len_fake(
+    global_dp_buffer_len: Optional[int],
+    num_tokens: Optional[int],
+    is_max_len: bool,
+    global_num_tokens: Optional[List[int]] = None,
+) -> None:
+    """Fake implementation of ``_set_dp_buffer_len``: ignores its arguments."""
+    return None
+
+
+@torch.library.custom_op("sglang::_set_is_extend_in_batch", mutates_args=())
+def _set_is_extend_in_batch(is_extend_in_batch: bool) -> None:
+    """Custom-op wrapper around ``dp_attention.set_is_extend_in_batch``.
+
+    Forwards ``is_extend_in_batch`` unchanged to
+    ``sglang.srt.layers.dp_attention.set_is_extend_in_batch``.
+    """
+    dp_attention = sglang.srt.layers.dp_attention
+    dp_attention.set_is_extend_in_batch(is_extend_in_batch)
+
+
+@_set_is_extend_in_batch.register_fake
+def _set_is_extend_in_batch_fake(is_extend_in_batch: bool) -> None:
+    """Fake implementation of ``_set_is_extend_in_batch``: ignores its argument."""
+    return None
@@ -0,0 +1,20 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch_npu
+
+
+class CompilationContext:
+    """Holder for two class-level attributes, both initialised to ``None``."""
+
+    # No type is annotated here; starts out as None.
+    # NOTE(review): presumably a graph-capture memory pool handle -- confirm.
+    graph_memory_pool = None
+    # Annotated as an NPU stream but initialised to None, so readers must
+    # handle the unset case.
+    stream: torch_npu.npu.Stream = None
@@ -0,0 +1,29 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch
+
+from sglang.srt.compilation.npu.npu_graph_compiler_backend import (
+    NpuGraphCompilerBackend,
+)
+
+
+class NpuGraphCompiler:
+    """Compiles a model with ``torch.compile`` using ``NpuGraphCompilerBackend``.
+
+    Attributes:
+        backend: The ``NpuGraphCompilerBackend`` built from ``model_type``.
+        compiled_callable: Result of ``torch.compile`` on the model, using
+            ``fullgraph=True`` and ``dynamic=False``.
+    """
+
+    def __init__(self, model: torch.nn.Module, model_type: torch.dtype):
+        # Reset Dynamo before setting up the new compilation.
+        torch._dynamo.reset()
+
+        backend = NpuGraphCompilerBackend(model_type)
+        self.backend = backend
+        self.compiled_callable = torch.compile(
+            model,
+            backend=backend,
+            fullgraph=True,
+            dynamic=False,
+        )
@@ -0,0 +1,46 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Callable
+
+import torch
+from torch._dynamo.eval_frame import DisableContext
+
+from sglang.srt.compilation.npu.pass_manager import PassManager
+from sglang.srt.compilation.npu.passes.w8a8_int8 import (
+    DivFuse,
+    EraseCopy,
+    NpuAddRmsNormQuantFuse,
+)
+
+
+class NpuGraphCompilerBackend:
+    """``torch.compile`` backend that optionally rewrites the captured graph.
+
+    When ``model_type`` is ``torch.bfloat16`` the fusion passes listed in
+    ``apply_passes`` are run on the graph; for any other dtype the graph is
+    returned untouched.
+    """
+
+    def __init__(self, model_type: torch.dtype):
+        self.model_type = model_type
+
+    def __call__(self, graph: torch.fx.GraphModule, example_inputs) -> Callable:
+        """Record the example inputs, run the passes if enabled, return the graph."""
+        # NOTE(review): ``compiled_function_args`` and ``batch_size`` are not
+        # defined in this file; presumably they are attached to
+        # ``DisableContext`` elsewhere -- confirm.
+        DisableContext.compiled_function_args[DisableContext.batch_size] = (
+            example_inputs
+        )
+        if self.model_type == torch.bfloat16:
+            NpuGraphCompilerBackend.apply_passes(graph)
+        return graph
+
+    @staticmethod
+    def apply_passes(graph_module: torch.fx.GraphModule):
+        """Run the fusion passes on ``graph_module`` in place and recompile it.
+
+        Declared as a static method because it takes no ``self``; without the
+        decorator, calling it on an instance would bind the instance to
+        ``graph_module``.
+        """
+        pass_manager = PassManager(graph_module)
+        pass_manager.add(NpuAddRmsNormQuantFuse)
+        pass_manager.add(DivFuse)
+        pass_manager.add(EraseCopy)
+        pass_manager.apply()
+        # Recompile unconditionally, as before, regardless of whether any
+        # pass reported a change.
+        graph_module.recompile()
@@ -0,0 +1,46 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch
+
+
+class PassManager:
+    """Applies a list of graph passes to one ``torch.fx.GraphModule``.
+
+    A pass is a class that is instantiated with no arguments. If the instance
+    is callable it is invoked with the graph module; otherwise the class must
+    expose ``pattern`` and ``replacement`` functions, which are handed to
+    ``torch.fx.replace_pattern``.
+    """
+
+    def __init__(self, graph_module: torch.fx.GraphModule):
+        self.graph_module = graph_module
+        self.passes = []
+
+    def add(self, pass_):
+        """Queue a pass class; passes run in the order they were added."""
+        self.passes.append(pass_)
+
+    def apply(self):
+        """Run every queued pass and recompile if any reported a change.
+
+        A pass that raises an ``Exception`` is treated as "not applied" and
+        the remaining passes still run. ``KeyboardInterrupt`` and
+        ``SystemExit`` are no longer swallowed.
+        """
+        updated = False
+        for pass_ in self.passes:
+            pass_instance = pass_()
+            results = []
+            try:
+                if callable(pass_instance):
+                    results = pass_instance(self.graph_module)
+                else:
+                    results = torch.fx.replace_pattern(
+                        self.graph_module, pass_.pattern, pass_.replacement
+                    )
+            except Exception:
+                # Best effort: a failing pass is skipped rather than
+                # aborting compilation; ``results`` stays empty.
+                pass
+
+            # Truthiness instead of ``len()`` so that a pass returning
+            # ``None`` counts as "no change" rather than raising.
+            updated = updated or bool(results)
+
+        if updated:
+            self.graph_module.recompile()
@@ -0,0 +1,100 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch
+
+
+class DivFuse:
+    """Pattern/replacement pair that cancels a double reciprocal.
+
+    ``pattern`` and ``replacement`` take no ``self``; they are plain
+    functions intended to be looked up on the class itself.
+    """
+
+    def pattern(x):
+        # Matches 1 / (1 / x).
+        reciprocal = 1.0 / x
+        return 1.0 / reciprocal
+
+    def replacement(x):
+        # The two reciprocals cancel, so the input is passed through as-is.
+        return x
+
+
+class EraseCopy:
+    """Callable graph pass that drops a ``copy_`` node ahead of a fused norm.
+
+    While walking the nodes in order it remembers the most recent ``copy_``
+    node and, optionally, a ``prepare_weight_cache`` node seen after it.
+    When an ``npu_add_rms_norm_quant`` node is then reached, both that node
+    and the remembered ``prepare_weight_cache`` node are rewired to use the
+    copy's second argument directly, and the ``copy_`` node is erased.
+
+    Returns a list with one ``(arg, copy_node, prepare_weight_cache_node)``
+    tuple per rewrite; an empty list means the graph was not modified.
+    """
+
+    def __call__(self, graph_module: torch.fx.GraphModule):
+        # Tracking state for the most recently seen candidate nodes.
+        # NOTE(review): this state is not reset between modules, so a
+        # ``copy_`` seen in one module's graph could be paired with a node
+        # from the next module -- confirm this cannot happen in practice.
+        copy_node = None
+        prepare_weight_cache_default_node = None
+
+        results = []
+        # NOTE(review): ``modules()`` also yields sub-modules; this assumes
+        # each of them exposes a ``.graph`` attribute -- confirm.
+        for module in graph_module.modules():
+            # Iterate over a snapshot because nodes are erased during the walk.
+            for node in list(module.graph.nodes):
+                if node.type == torch.nn.parameter.Parameter:
+                    continue
+                if node.target == "copy_":
+                    # A new copy starts a fresh candidate; forget any
+                    # prepare_weight_cache node tied to the previous one.
+                    copy_node = node
+                    prepare_weight_cache_default_node = None
+                    continue
+
+                if (
+                    copy_node
+                    and node.target == torch.ops.sglang.prepare_weight_cache.default
+                ):
+                    prepare_weight_cache_default_node = node
+                    continue
+
+                if copy_node and node.target == torch.ops.npu.npu_add_rms_norm_quant:
+                    # Second argument of the copy_ call.
+                    # NOTE(review): presumably the source tensor of the copy
+                    # -- confirm.
+                    arg = copy_node.args[1]
+
+                    if prepare_weight_cache_default_node is not None:
+                        # Replace the first argument, keep the second.
+                        prepare_weight_cache_default_node.args = (
+                            arg,
+                            prepare_weight_cache_default_node.args[1],
+                        )
+
+                    # Rebuild the args with only position 1 replaced.
+                    # This assumes exactly five positional arguments.
+                    node.args = (
+                        node.args[0],
+                        arg,
+                        node.args[2],
+                        node.args[3],
+                        node.args[4],
+                    )
+
+                    # Erase only after the consumers above were rewired.
+                    module.graph.erase_node(copy_node)
+
+                    result = (
+                        arg,
+                        copy_node,
+                        prepare_weight_cache_default_node,
+                    )
+                    results.append(result)
+
+                    # Reset so the next rewrite needs a new copy_ node.
+                    copy_node = None
+                    prepare_weight_cache_default_node = None
+
+        return results
+
+
+class NpuAddRmsNormQuantFuse:
+    """Pattern/replacement pair fusing add-RMS-norm with the following quantize.
+
+    ``pattern`` describes ``npu_add_rms_norm`` followed by ``npu_quantize``
+    on its first output; ``replacement`` emits a single
+    ``npu_add_rms_norm_quant`` call instead. Both return the quantized
+    tensor together with element 2 of the norm op's output.
+    """
+
+    def pattern(rms_norm_input, residual, rms_norm_weight, scale, offset, v1, v2, v3):
+        # The epsilon is the literal 1e-6 here, not a pattern parameter.
+        output = torch.ops.npu.npu_add_rms_norm(
+            rms_norm_input, residual, rms_norm_weight, 1e-6
+        )
+        out0 = output[0]
+        out2 = output[2]
+        # v1, v2, v3 are forwarded to npu_quantize as-is.
+        quantized_output = torch.ops.npu.npu_quantize(out0, scale, offset, v1, v2, v3)
+        return quantized_output, out2
+
+    def replacement(
+        rms_norm_input, residual, rms_norm_weight, scale, offset, v1, v2, v3
+    ):
+        # v1, v2, v3 are accepted to mirror the pattern signature but are
+        # not used by the fused op. The fused op is given the reciprocal of
+        # the scale that npu_quantize received.
+        output = torch.ops.npu.npu_add_rms_norm_quant(
+            rms_norm_input, residual, rms_norm_weight, 1.0 / scale, offset, epsilon=1e-6
+        )
+        quantized_output = output[0]
+        out2 = output[2]
+        return quantized_output, out2