diff --git a/components/src/dynamo/sglang/main.py b/components/src/dynamo/sglang/main.py index 5fcea0152e..cc76e2ac80 100644 --- a/components/src/dynamo/sglang/main.py +++ b/components/src/dynamo/sglang/main.py @@ -103,11 +103,8 @@ async def init(runtime: DistributedRuntime, config: Config): server_args, dynamo_args = config.server_args, config.dynamo_args # Prevent SGLang from blocking on non-leader nodes - # We can switch this to 0 and leverage our own metrics - # after https://github.com/sgl-project/sglang/pull/13686 - # is merged in if server_args.node_rank >= 1: - os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "1" + os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0" engine = sgl.Engine(server_args=server_args) @@ -222,11 +219,8 @@ async def init_prefill(runtime: DistributedRuntime, config: Config): server_args, dynamo_args = config.server_args, config.dynamo_args # Prevent SGLang from blocking on non-leader nodes - # We can switch this to 0 and leverage our own metrics - # after https://github.com/sgl-project/sglang/pull/13686 - # is merged in if server_args.node_rank >= 1: - os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "1" + os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0" engine = sgl.Engine(server_args=server_args) diff --git a/components/src/dynamo/sglang/request_handlers/handler_base.py b/components/src/dynamo/sglang/request_handlers/handler_base.py index 4d4472e19a..9e47268ac1 100644 --- a/components/src/dynamo/sglang/request_handlers/handler_base.py +++ b/components/src/dynamo/sglang/request_handlers/handler_base.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +import base64 +import json import logging import random import socket @@ -10,6 +12,7 @@ from typing import Any, AsyncGenerator, Dict, Optional, Tuple import sglang as sgl +from sglang.srt.tracing import trace as sglang_trace from sglang.srt.utils import get_local_ip_auto from dynamo._core import Client, Component, Context @@ -49,6 +52,7 @@ def __init__( 
self.prefill_client = prefill_client self.serving_mode = config.serving_mode self.skip_tokenizer_init = config.server_args.skip_tokenizer_init + self.enable_trace = config.server_args.enable_trace @abstractmethod async def generate(self, request: Dict[str, Any], context: Context): @@ -117,6 +121,39 @@ def _get_bootstrap_info(engine: sgl.Engine) -> Tuple[str, int]: return bootstrap_host, bootstrap_port + def _propagate_trace_context_to_sglang( + self, context: Context, bootstrap_room: int = 0 + ): + """Propagate Dynamo's trace context to SGLang for distributed tracing. SGLang expects a certain + format derived by looking at https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/tracing/trace.py + in the to_dict() method. + + Args: + context: Dynamo Context object containing trace information. + bootstrap_room: Bootstrap room ID (0 for aggregated, actual room for disaggregated). + """ + trace_id = context.trace_id + span_id = context.span_id + if not trace_id or not span_id: + return + + # Build trace context for SGLang + trace_context = { + str(bootstrap_room): { + "root_span": {"traceparent": f"00-{trace_id}-{span_id}-01"}, + "prev_span": { + "span_id": int(span_id, 16), + "trace_id": int(trace_id, 16), + }, + } + } + + # Encode and propagate + base64_context = base64.b64encode( + json.dumps(trace_context, ensure_ascii=False).encode("utf-8") + ).decode("utf-8") + sglang_trace.trace_set_remote_propagate_context(base64_context) + async def _handle_cancellation( self, request_id_future: asyncio.Future, context: Context ): diff --git a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py index e7fd9f17ae..47572e2f54 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py @@ -112,6 +112,7 @@ async def generate( RuntimeError: If no bootstrap info received from prefill worker. 
""" logging.debug(f"New Request ID: {context.id()}") + trace_id = context.trace_id sampling_params = self._build_sampling_params(request) input_param = self._get_input_param(request) @@ -154,6 +155,11 @@ async def generate( if not bootstrap_info: raise RuntimeError("No bootstrap info received from prefill worker") + if self.enable_trace: + self._propagate_trace_context_to_sglang( + context, bootstrap_info["bootstrap_room"] + ) + decode = await self.engine.async_generate( **input_param, sampling_params=sampling_params, @@ -161,6 +167,7 @@ async def generate( bootstrap_host=bootstrap_info["bootstrap_host"], bootstrap_port=bootstrap_info["bootstrap_port"], bootstrap_room=bootstrap_info["bootstrap_room"], + rid=trace_id, ) if self.skip_tokenizer_init: @@ -170,10 +177,14 @@ async def generate( async for out in self._process_text_stream(decode, context): yield out else: + if self.enable_trace: + self._propagate_trace_context_to_sglang(context) + agg = await self.engine.async_generate( **input_param, sampling_params=sampling_params, stream=True, + rid=trace_id, ) if self.skip_tokenizer_init: async for out in self._process_token_stream(agg, context): diff --git a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py index dc55ab9762..e019ea5c9e 100644 --- a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py +++ b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py @@ -64,6 +64,7 @@ async def generate( Bootstrap info dict with host, port, and room for decode worker connection. 
""" logging.debug(f"New Request ID: {context.id()}") + trace_id = context.trace_id bootstrap_room = self._generate_bootstrap_room() bootstrap_info = { @@ -76,6 +77,10 @@ async def generate( input_param = self._get_input_param(request["request"]) + # Propagate trace context to SGLang + if self.enable_trace: + self._propagate_trace_context_to_sglang(context, bootstrap_room) + results = await self.engine.async_generate( **input_param, sampling_params=request["sampling_params"], @@ -83,6 +88,7 @@ async def generate( bootstrap_host=self.bootstrap_host, bootstrap_port=self.bootstrap_port, bootstrap_room=bootstrap_room, + rid=trace_id, ) task = asyncio.create_task(self._consume_results(results, context)) diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang index e6b1821e4a..5315352cc3 100644 --- a/container/Dockerfile.sglang +++ b/container/Dockerfile.sglang @@ -66,7 +66,7 @@ ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee ARG DEEPEP_GB_COMMIT=1b14ad661c7640137fcfe93cccb2694ede1220b0 ARG CMAKE_BUILD_PARALLEL_LEVEL=2 ARG SGL_KERNEL_VERSION=0.3.16.post5 -ARG SGLANG_COMMIT=0.5.4.post3 +ARG SGLANG_COMMIT=0.5.6 ARG GDRCOPY_COMMIT=v2.4.4 ARG NVSHMEM_VERSION=3.3.9 ARG GRACE_BLACKWELL=false diff --git a/deploy/observability/tempo.yaml b/deploy/observability/tempo.yaml index d5656245ee..a150aca64c 100644 --- a/deploy/observability/tempo.yaml +++ b/deploy/observability/tempo.yaml @@ -9,7 +9,7 @@ distributor: otlp: protocols: grpc: - endpoint: 0.0.0.0:4317 + endpoint: 0.0.0.0:4317 # Receives from OTEL collector http: endpoint: 0.0.0.0:4318 diff --git a/examples/backends/sglang/launch/agg.sh b/examples/backends/sglang/launch/agg.sh index 9ccb48f260..43e4f1f4af 100755 --- a/examples/backends/sglang/launch/agg.sh +++ b/examples/backends/sglang/launch/agg.sh @@ -46,10 +46,12 @@ while [[ $# -gt 0 ]]; do done # Enable tracing if requested +TRACE_ARGS=() if [ "$ENABLE_OTEL" = true ]; then export DYN_LOGGING_JSONL=true export OTEL_EXPORT_ENABLED=1 export 
OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} + TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317) fi # run ingress @@ -59,7 +61,7 @@ python3 -m dynamo.frontend & DYNAMO_PID=$! # run worker with metrics enabled -DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ +OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \ python3 -m dynamo.sglang \ --model-path "$MODEL" \ --served-model-name "$MODEL" \ @@ -68,4 +70,5 @@ python3 -m dynamo.sglang \ --trust-remote-code \ --skip-tokenizer-init \ --enable-metrics \ + "${TRACE_ARGS[@]}" \ "${EXTRA_ARGS[@]}" diff --git a/examples/backends/sglang/launch/agg_embed.sh b/examples/backends/sglang/launch/agg_embed.sh index 9064273f30..e78ebb2458 100755 --- a/examples/backends/sglang/launch/agg_embed.sh +++ b/examples/backends/sglang/launch/agg_embed.sh @@ -37,10 +37,12 @@ while [[ $# -gt 0 ]]; do done # Enable tracing if requested +TRACE_ARGS=() if [ "$ENABLE_OTEL" = true ]; then export DYN_LOGGING_JSONL=true export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} + TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317) fi # run ingress @@ -59,4 +61,5 @@ python3 -m dynamo.sglang \ --tp 1 \ --trust-remote-code \ --use-sglang-tokenizer \ - --enable-metrics + --enable-metrics \ + "${TRACE_ARGS[@]}" diff --git a/examples/backends/sglang/launch/agg_router.sh b/examples/backends/sglang/launch/agg_router.sh index 0b336f5f15..4cfca011f4 100755 --- a/examples/backends/sglang/launch/agg_router.sh +++ b/examples/backends/sglang/launch/agg_router.sh @@ -37,10 +37,12 @@ while [[ $# -gt 0 ]]; do done # Enable tracing if requested +TRACE_ARGS=() if [ "$ENABLE_OTEL" = true ]; then export DYN_LOGGING_JSONL=true export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} + TRACE_ARGS+=(--enable-trace 
--otlp-traces-endpoint localhost:4317) fi # run ingress @@ -58,7 +60,8 @@ python3 -m dynamo.sglang \ --tp 1 \ --trust-remote-code \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' \ - --enable-metrics & + --enable-metrics \ + "${TRACE_ARGS[@]}" & WORKER_PID=$! OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \ @@ -69,4 +72,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ --tp 1 \ --trust-remote-code \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}' \ - --enable-metrics + --enable-metrics \ + "${TRACE_ARGS[@]}" diff --git a/examples/backends/sglang/launch/disagg.sh b/examples/backends/sglang/launch/disagg.sh index c4d14fcb52..9291ffb0c8 100755 --- a/examples/backends/sglang/launch/disagg.sh +++ b/examples/backends/sglang/launch/disagg.sh @@ -37,10 +37,12 @@ while [[ $# -gt 0 ]]; do done # Enable tracing if requested +TRACE_ARGS=() if [ "$ENABLE_OTEL" = true ]; then export DYN_LOGGING_JSONL=true export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} + TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317) fi # run ingress @@ -65,7 +67,8 @@ python3 -m dynamo.sglang \ --host 0.0.0.0 \ --port 40000 \ --disaggregation-transfer-backend nixl \ - --enable-metrics --log-level debug & + --enable-metrics \ + "${TRACE_ARGS[@]}" & PREFILL_PID=$! 
# run decode worker @@ -81,4 +84,5 @@ CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang \ --disaggregation-bootstrap-port 12345 \ --host 0.0.0.0 \ --disaggregation-transfer-backend nixl \ - --enable-metrics --log-level debug + --enable-metrics \ + "${TRACE_ARGS[@]}" diff --git a/examples/backends/sglang/launch/disagg_router.sh b/examples/backends/sglang/launch/disagg_router.sh index 916cbbf410..16a7db750e 100755 --- a/examples/backends/sglang/launch/disagg_router.sh +++ b/examples/backends/sglang/launch/disagg_router.sh @@ -38,10 +38,12 @@ while [[ $# -gt 0 ]]; do done # Enable tracing if requested +TRACE_ARGS=() if [ "$ENABLE_OTEL" = true ]; then export DYN_LOGGING_JSONL=true export OTEL_EXPORT_ENABLED=1 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317} + TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317) fi # run ingress @@ -74,7 +76,8 @@ python3 -m dynamo.sglang \ --host 0.0.0.0 \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' \ --disaggregation-transfer-backend nixl \ - --enable-metrics & + --enable-metrics \ + "${TRACE_ARGS[@]}" & PREFILL_PID=$! # run prefill worker @@ -89,7 +92,8 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ --host 0.0.0.0 \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}' \ --disaggregation-transfer-backend nixl \ - --enable-metrics & + --enable-metrics \ + "${TRACE_ARGS[@]}" & PREFILL_PID=$! # run decode worker @@ -104,7 +108,8 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \ --host 0.0.0.0 \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5560"}' \ --disaggregation-transfer-backend nixl \ - --enable-metrics & + --enable-metrics \ + "${TRACE_ARGS[@]}" & PREFILL_PID=$! 
# run decode worker @@ -119,4 +124,5 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \ --host 0.0.0.0 \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559"}' \ --disaggregation-transfer-backend nixl \ - --enable-metrics + --enable-metrics \ + "${TRACE_ARGS[@]}"