
Commit a76c479

dmitry-tokarev-nv, karen-sy, alec-flowers, jthomson04, nv-anants authored
chore: 0.7.1 + vllm 0.12.0 (#4824)
Signed-off-by: alec-flowers <[email protected]>
Signed-off-by: Karen Chung <[email protected]>
Signed-off-by: jthomson04 <[email protected]>
Signed-off-by: Anant Sharma <[email protected]>
Co-authored-by: Karen Chung <[email protected]>
Co-authored-by: alec-flowers <[email protected]>
Co-authored-by: jthomson04 <[email protected]>
Co-authored-by: Anant Sharma <[email protected]>
Co-authored-by: Kris Hung <[email protected]>
1 parent d7702b8 commit a76c479

File tree: 39 files changed (+695, -240 lines changed)

.github/actions/pytest/action.yml
Lines changed: 1 addition & 1 deletion

@@ -130,4 +130,4 @@ runs:
       path: |
         test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
         test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
-      retention-days: 7
+      retention-days: 7

components/src/dynamo/vllm/args.py
Lines changed: 19 additions & 1 deletion

@@ -192,6 +192,24 @@ def parse_args() -> Config:
     args = parser.parse_args()
     engine_args = AsyncEngineArgs.from_cli_args(args)

+    # Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
+    # With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
+    # process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
+    # blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
+    # processes, avoiding the GIL contention.
+    # Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
+    # and forcing mp can expose race conditions in vLLM's scheduler.
+    # See: https://github.com/vllm-project/vllm/issues/29369
+    connector_list = [c.lower() for c in args.connector] if args.connector else []
+    uses_nixl = "nixl" in connector_list
+    tp_size = getattr(engine_args, "tensor_parallel_size", None) or 1
+    if uses_nixl and tp_size == 1 and engine_args.distributed_executor_backend is None:
+        logger.info(
+            "Setting --distributed-executor-backend=mp for TP=1 to avoid "
+            "UniProcExecutor GIL contention with NIXL connector"
+        )
+        engine_args.distributed_executor_backend = "mp"
+
     if engine_args.enable_prefix_caching is None:
         logger.debug(
             "--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"

@@ -443,7 +461,7 @@ def overwrite_args(config):
         # skip tokenizer initialisation. Setting this to **False** avoids
         # a NoneType error when the processor accesses the tokenizer.
         "skip_tokenizer_init": False,
-        "disable_log_requests": True,
+        "enable_log_requests": False,
         "disable_log_stats": False,
     }
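Since the new guard only fires when engine_args.distributed_executor_backend is still None, an explicit --distributed-executor-backend choice on the command line is never overridden. A minimal sketch of that behavior in isolation; the maybe_force_mp_executor helper below is illustrative only and not part of this commit:

from vllm.engine.arg_utils import AsyncEngineArgs

def maybe_force_mp_executor(engine_args: AsyncEngineArgs, connectors: list[str]) -> None:
    # Mirrors the guard added in parse_args(): only NIXL + TP=1 + no explicit
    # backend choice gets switched to the multiprocessing executor.
    uses_nixl = "nixl" in [c.lower() for c in connectors]
    tp_size = getattr(engine_args, "tensor_parallel_size", None) or 1
    if uses_nixl and tp_size == 1 and engine_args.distributed_executor_backend is None:
        engine_args.distributed_executor_backend = "mp"

# An explicit backend survives untouched; only the unset case is rewritten.
explicit = AsyncEngineArgs(model="facebook/opt-125m", distributed_executor_backend="ray")
maybe_force_mp_executor(explicit, ["nixl"])
assert explicit.distributed_executor_backend == "ray"

unset = AsyncEngineArgs(model="facebook/opt-125m")  # tensor_parallel_size defaults to 1
maybe_force_mp_executor(unset, ["nixl"])
assert unset.distributed_executor_backend == "mp"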

components/src/dynamo/vllm/main.py
Lines changed: 1 addition & 1 deletion

@@ -328,7 +328,7 @@ def setup_vllm_engine(config, stat_logger=None):
         vllm_config=vllm_config,
         usage_context=usage_context,
         stat_loggers=factory,
-        disable_log_requests=engine_args.disable_log_requests,
+        enable_log_requests=engine_args.enable_log_requests,
         disable_log_stats=engine_args.disable_log_stats,
     )
     if ENABLE_LMCACHE:
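The same rename shows up here: the engine is now handed enable_log_requests instead of the removed disable_log_requests. For code that has to tolerate both older and newer vLLM releases, one possible shim is to probe for whichever attribute the installed AsyncEngineArgs exposes; this is a sketch, not part of the commit:

def request_logging_kwargs(engine_args) -> dict:
    # Newer vLLM exposes enable_log_requests; older releases used the inverted
    # disable_log_requests flag. Forward whichever attribute actually exists.
    if hasattr(engine_args, "enable_log_requests"):
        return {"enable_log_requests": engine_args.enable_log_requests}
    return {"disable_log_requests": engine_args.disable_log_requests}

# Usage: pass **request_logging_kwargs(engine_args) alongside the other engine kwargs.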

components/src/dynamo/vllm/multimodal_handlers/processor_handler.py
Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
 from vllm.outputs import RequestOutput
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike as AnyTokenizer

 from dynamo.runtime import Client

components/src/dynamo/vllm/multimodal_utils/chat_processor.py
Lines changed: 35 additions & 10 deletions

@@ -28,9 +28,22 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_engine import RequestPrompt
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.inputs.data import TokensPrompt
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike as AnyTokenizer
+
+
+class StubEngineClient:
+    """
+    Stub EngineClient for preprocessing-only use of OpenAIServingChat/Completion.
+    Provides the minimal attributes required by OpenAIServingModels.
+    """
+
+    def __init__(self, model_config: ModelConfig):
+        self.model_config = model_config
+        self.input_processor = None
+        self.io_processor = None


 @runtime_checkable

@@ -120,12 +133,19 @@ class ChatProcessor:
     def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
         self.tokenizer = tokenizer
         self.model_config = model_config
+        # Create stub engine client and models for preprocessing-only usage
+        stub_engine = StubEngineClient(model_config)
+        serving_models = OpenAIServingModels(
+            engine_client=stub_engine,
+            base_model_paths=[
+                BaseModelPath(name=model_config.model, model_path=model_config.model)
+            ],
+        )
         self.openai_serving = OpenAIServingChat(
-            engine_client=None,
-            model_config=model_config,
-            models=None,
-            request_logger=None,
+            engine_client=stub_engine,
+            models=serving_models,
             response_role="assistant",
+            request_logger=None,
             chat_template=None,
             chat_template_content_format="auto",
         )

@@ -186,7 +206,6 @@ async def stream_response(
             conversation,
             self.tokenizer,
             request_metadata,
-            enable_force_include_usage=False,
         ):
             if raw_response.startswith("data: [DONE]"):
                 yield raw_response

@@ -220,7 +239,6 @@ async def stream_response(
             conversation,
             self.tokenizer,
             request_metadata,
-            enable_force_include_usage=False,
         ):
             if raw_response.startswith("data: [DONE]"):
                 break

@@ -267,10 +285,17 @@ class CompletionsProcessor:
     def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
         self.tokenizer = tokenizer
         self.model_config = model_config
+        # Create stub engine client and models for preprocessing-only usage
+        stub_engine = StubEngineClient(model_config)
+        serving_models = OpenAIServingModels(
+            engine_client=stub_engine,
+            base_model_paths=[
+                BaseModelPath(name=model_config.model, model_path=model_config.model)
+            ],
+        )
         self.openai_serving = OpenAIServingCompletion(
-            engine_client=None,
-            model_config=model_config,
-            models=None,
+            engine_client=stub_engine,
+            models=serving_models,
             request_logger=None,
         )
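The underlying pattern is a stub (null object): during preprocessing the serving classes only read a handful of attributes from their engine client, so a minimal object satisfying those reads can stand in for a live engine. A vLLM-free sketch of the same idea; every name below is illustrative and not part of the commit:

from dataclasses import dataclass


@dataclass
class FakeModelConfig:
    model: str


class StubEngine:
    # Carries only the attributes the serving layer actually touches.
    def __init__(self, model_config: FakeModelConfig):
        self.model_config = model_config
        self.input_processor = None
        self.io_processor = None


class ServingLayer:
    # Stand-in for a serving class: it needs the engine only for metadata.
    def __init__(self, engine_client, served_model_names: list[str]):
        self.model_name = engine_client.model_config.model
        self.served_model_names = served_model_names


config = FakeModelConfig(model="my-model")
stub = StubEngine(config)
serving = ServingLayer(engine_client=stub, served_model_names=[config.model])
assert serving.model_name == "my-model"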

components/src/dynamo/vllm/multimodal_utils/protocol.py
Lines changed: 3 additions & 3 deletions

@@ -26,7 +26,7 @@
 from vllm.multimodal.inputs import MultiModalUUIDDict  # noqa: F401
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import RequestMetrics
+from vllm.v1.metrics.stats import RequestStateStats

 import dynamo.nixl_connect as connect

@@ -156,7 +156,7 @@ class MyRequestOutput(BaseModel):
     https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85

     This class is used to serialize the RequestOutput and any recursively defined types
-    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
+    We can do this because PromptLogprobs, RequestStateStats, and CompletionOutput are all serializable dataclasses
     """

     model_config = ConfigDict(arbitrary_types_allowed=True)

@@ -167,7 +167,7 @@ class MyRequestOutput(BaseModel):
     prompt_logprobs: Optional[PromptLogprobs] = None
     outputs: List[CompletionOutput]
     finished: bool
-    metrics: Optional[RequestMetrics] = None
+    metrics: Optional[RequestStateStats] = None
     kv_transfer_params: Optional[dict[str, Any]] = None
     # lora_request: Optional[LoRARequest] = None
     # encoder_prompt: Optional[str] = None
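The metrics field simply swaps one serializable dataclass for another; MyRequestOutput keeps working because Pydantic handles dataclass-typed fields, with arbitrary_types_allowed covering the remaining vLLM types. A small self-contained sketch of that mechanism, where Stats is a stand-in and not the real RequestStateStats:

from dataclasses import dataclass
from typing import Optional

from pydantic import BaseModel, ConfigDict


@dataclass
class Stats:
    # Stand-in for a serializable vLLM dataclass such as RequestStateStats.
    num_generation_tokens: int = 0


class WireOutput(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    request_id: str
    finished: bool = False
    metrics: Optional[Stats] = None


msg = WireOutput(request_id="req-1", metrics=Stats(num_generation_tokens=7))
print(msg.model_dump())  # the dataclass field round-trips as a nested dict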

container/Dockerfile.vllm
Lines changed: 18 additions & 11 deletions

@@ -10,17 +10,17 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 ARG ENABLE_KVBM=false
 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
-ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG CUDA_VERSION="12.8"
+ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
+ARG CUDA_VERSION="12.9"

 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.11.0"
-# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.1"
-ARG TORCH_BACKEND="cu128"
+ARG VLLM_REF="v0.12.0"
+# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
+ARG FLASHINF_REF="v0.5.3"

 # If left blank, then we will fallback to vLLM defaults
 ARG DEEPGEMM_REF=""
+ARG LMCACHE_REF="0.3.10"

 # sccache configuration - inherit from base build
 ARG USE_SCCACHE

@@ -109,7 +109,7 @@ ARG VLLM_REF
 ARG VLLM_GIT_URL
 ARG DEEPGEMM_REF
 ARG FLASHINF_REF
-ARG TORCH_BACKEND
+ARG LMCACHE_REF
 ARG CUDA_VERSION

 ARG MAX_JOBS=16

@@ -143,7 +143,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
     cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
     chmod +x /tmp/install_vllm.sh && \
-    /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
+    /tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION && \
     /tmp/use-sccache.sh show-stats "vLLM";

 ENV LD_LIBRARY_PATH=\

@@ -206,7 +206,7 @@ RUN apt-get update && \
     # prometheus dependencies
     ca-certificates \
     # DeepGemm uses 'cuobjdump' which does not come with CUDA image
-    cuda-command-line-tools-12-8 && \
+    cuda-command-line-tools-12-9 && \
     rm -rf /var/lib/apt/lists/*

 # Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image

@@ -287,8 +287,14 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
     --requirement /tmp/requirements.txt \
     --requirement /tmp/requirements.test.txt

-# Copy benchmarks, examples, and tests for CI with correct ownership
-COPY --chown=dynamo: . /workspace/
+# Copy tests, benchmarks, deploy and components for CI
+COPY --chown=dynamo: benchmarks /workspace/benchmarks
+COPY --chown=dynamo: tests /workspace/tests
+COPY --chown=dynamo: examples /workspace/examples
+COPY --chown=dynamo: deploy /workspace/deploy
+COPY --chown=dynamo: recipes/ /workspace/recipes/
+COPY --chown=dynamo: components/ /workspace/components/
+COPY --chown=dynamo: lib/ /workspace/lib/

 # Copy attribution files
 COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/

@@ -373,6 +379,7 @@ COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

 # Install maturin, for maturin develop
 # Editable install of dynamo
+COPY pyproject.toml README.md hatch_build.py /workspace/
 RUN uv pip install maturin[patchelf] && \
     uv pip install --no-deps -e .

container/build.sh
Lines changed: 1 addition & 1 deletion

@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
 # for details and reproducer to manually test if the image
 # can be updated to later versions.
-VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"

 NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
