Revise compilation and pass_manager

luoyuan.luo · luoyuan.luo · commit 2baee2a4c4f6 · 2025-11-30T20:50:01.000+08:00
diff --git a/python/sglang/srt/compilation/backend.py b/python/sglang/srt/compilation/backend.py
@@ -89,21 +89,24 @@ def load(
         graph_index: int,
         runtime_shape: Optional[int] = None,
     ) -> Optional[Callable]:
-        handle = self.cache[(runtime_shape, graph_index, self.compiler.name)]
+        key = (runtime_shape, graph_index, self.compiler.name)
+        handle = self.cache.get(key, None)
+        if handle is None:
+            return None
+
         compiled_graph = self.compiler.load(
             handle, graph, example_inputs, graph_index, runtime_shape
         )
         if runtime_shape is None:
             logger.debug(
-                "Directly load the %s-th graph for dynamic shape from %s via "
-                "handle %s",
+                "Directly load the %s-th graph for dynamic shape from %s via handle %s",
                 graph_index,
                 self.compiler.name,
                 handle,
             )
         else:
             logger.debug(
-                "Directly load the %s-th graph for shape %s from %s via " "handle %s",
+                "Directly load the %s-th graph for shape %s from %s via handle %s",
                 graph_index,
                 str(runtime_shape),
                 self.compiler.name,
@@ -299,7 +302,7 @@ def __init__(
         # When True, it annoyingly dumps the torch.fx.Graph on errors.
         self.extra_traceback = False
         self.sglang_config = sglang_config
-        self.compilation_config = sglang_config.compile_config
+        self.compilation_config = sglang_config.compilation_config
 
     def run(self, *args):
         fake_args = [
@@ -329,7 +332,7 @@ def call_module(
                 self.sglang_backend.compiler_manager.compile(
                     submod,
                     args,
-                    self.inductor_config,
+                    self.sglang_backend.inductor_config,
                     self.compilation_config,
                     graph_index=index,
                     num_graphs=len(self.compile_submod_names),
@@ -340,7 +343,7 @@ def call_module(
             self.module.__dict__[target] = CUDAPiecewiseBackend(
                 submod,
                 self.compilation_config,
-                self.inductor_config,
+                self.sglang_backend.inductor_config,
                 self.graph_pool,
                 index,
                 len(self.compile_submod_names),
diff --git a/python/sglang/srt/compilation/fix_functionalization.py b/python/sglang/srt/compilation/fix_functionalization.py
@@ -9,7 +9,7 @@
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 
 from sglang.srt.compilation.fx_utils import is_func
-from sglang.srt.compilation.inductor_pass import SGLangInductorPass
+from sglang.srt.compilation.sglang_inductor_pass import SGLangInductorPass
 
 logger = logging.getLogger(__name__)
 
diff --git a/python/sglang/srt/compilation/inductor_pass.py b/python/sglang/srt/compilation/inductor_pass.py
@@ -5,14 +5,12 @@
 import inspect
 import json
 import logging
-import time
 import types
 from contextlib import contextmanager
 from typing import Any, Callable, Optional, Union
 
 import torch
 from torch import fx
-from torch._dynamo.utils import lazy_format_graph_code
 from torch._inductor.custom_graph_pass import CustomGraphPass
 from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
 
@@ -113,35 +111,6 @@ def uuid(self) -> Any:
         return self._uuid
 
 
-class SGLangInductorPass(InductorPass):
-
-    def __init__(
-        self,
-    ):
-        self.pass_name = self.__class__.__name__
-
-    def dump_graph(self, graph: torch.fx.Graph, stage: str):
-        lazy_format_graph_code(stage, graph.owning_module)
-
-    def begin(self):
-        self._start_time = time.perf_counter_ns()
-
-    def end_and_log(self):
-        self._end_time = time.perf_counter_ns()
-        duration_ms = float(self._end_time - self._start_time) / 1.0e6
-        logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms)
-
-
-class PrinterInductorPass(SGLangInductorPass):
-
-    def __init__(self, name: str):
-        super().__init__()
-        self.name = name
-
-    def __call__(self, graph: torch.fx.Graph):
-        self.dump_graph(graph, self.name)
-
-
 def enable_fake_mode(fn: Callable[..., Any]) -> Callable[..., Any]:
     """
     Applies a FakeTensorMode context. This is useful when you don't want to
diff --git a/python/sglang/srt/compilation/pass_manager.py b/python/sglang/srt/compilation/pass_manager.py
@@ -9,9 +9,9 @@
 from sglang.srt.compilation.inductor_pass import (
     CustomGraphPass,
     InductorPass,
-    SGLangInductorPass,
     get_pass_context,
 )
+from sglang.srt.compilation.sglang_inductor_pass import SGLangInductorPass
 from sglang.srt.configs.sglang_config import SGLangConfig, set_current_sglang_config
 
 logger = logging.getLogger(__name__)
@@ -45,12 +45,12 @@ def __call__(self, graph: fx.Graph):
         self.fix_functionalization(graph)
 
     def configure(self, config: SGLangConfig):
-        # TODO(yuan-luo): PassConfig
-        self.pass_config = dict()
-        self.fix_functionalization = FixFunctionalizationPass()
+        self.pass_config = config.compilation_config.pass_config
 
         with set_current_sglang_config(config, check_compile=False):
-            self.passes += [AllReduceFusionPass(config)]
+            if self.pass_config.enable_fi_allreduce_fusion:
+                self.passes += [AllReduceFusionPass(config)]
+            self.fix_functionalization = FixFunctionalizationPass(config)
 
     def add(self, pass_: InductorPass):
         assert isinstance(pass_, InductorPass)
diff --git a/python/sglang/srt/configs/compilation_config.py b/python/sglang/srt/configs/compilation_config.py
@@ -25,6 +25,7 @@ class CompilationMode:
     shape specialization, and custom passes."""
 
 
+@dataclass
 class PassConfig:
     """Configuration for custom Inductor passes.
     This is separate from general `CompilationConfig` so that inductor passes
@@ -69,9 +70,20 @@ class CompilationConfig:
         certain small batchsizes, where inductor is good at optimizing.
     """
 
+    # Sizes to capture cudagraph.
+    # - No default: the field below is declared as List[int] without one.
+    # - list[int]: capture sizes are specified as given.
+    capture_sizes: List[int]
+
+    compiler: str = "eager"
+
+    enable_debug_mode: bool = False
+
     # Top-level Compilation control
     level: Optional[int] = None
 
+    mode: CompilationMode | None = None
+
     # The backend for compilation. It needs to be a string:
     # (empty string): use the default backend ("inductor" on CUDA-alike
     # platforms).
@@ -82,32 +94,32 @@ class CompilationConfig:
     # Inductor capture
     use_inductor: bool = True
 
+    """NOTE(review): this text describes custom-op enable/disable and may not
+    apply to `splitting_ops` below -- confirm. Use 'all' to enable all, 'none'
+    to disable all. Also specify a list of custom op names to enable
+    (prefixed with a '+'), or disable (prefixed with a '-'). Examples:
+
+    - 'all,-op1' to enable all except op1
+    - 'none,+op1,+op2' to enable only op1 and op2
+
+    By default, all custom ops are enabled when running without Inductor and
+    disabled when running with Inductor: mode>=SGLANG_COMPILE and backend="inductor".
+    Inductor generates (fused) Triton kernels for disabled custom ops."""
+    splitting_ops: list[str] | None = None
+
+    use_inductor_graph_partition: bool = False
+
     inductor_compile_config: dict = field(default_factory=dict)
 
     inductor_passes: dict[str, str] = field(default_factory=dict)
 
-    # Sizes to capture cudagraph.
-    # - None (default): capture sizes are inferred from sglang config.
-    # - list[int]: capture sizes are specified as given.
-    cudagraph_capture_sizes: list[int] | None = None
-
     pass_config: PassConfig = field(default_factory=PassConfig)
 
     # time taken for compilation
     compilation_time: float = field(default=0.0, init=False)
 
-    compiler: str = ""
-
-    def __init__(
-        self,
-        capture_sizes: List[int],
-        compiler: str = "eager",
-        enable_debug_mode: bool = False,
-    ):
-        self.traced_files = set()
-        self.capture_sizes = capture_sizes
-        self.compiler = compiler
-        self.enable_debug_mode = enable_debug_mode
-
     def get_capture_sizes(self):
         return self.capture_sizes
+
+    def get_enable_debug_mode(self):
+        return self.enable_debug_mode
diff --git a/python/sglang/srt/configs/sglang_config.py b/python/sglang/srt/configs/sglang_config.py
@@ -1,5 +1,6 @@
 import copy
 import logging
+from contextlib import contextmanager
 from dataclasses import replace
 from functools import lru_cache
 
@@ -47,6 +48,7 @@ def with_hf_config(
 _current_prefix: str | None = None
 
 
+@contextmanager
 def set_current_sglang_config(
     sglang_config: SGLangConfig, check_compile=False, prefix: str | None = None
 ):