
Commit 10c2c7b

krishung5 authored and zxue2 committed
fix: Fix multimodal EPD examples for vllm version bump (ai-dynamo#4849)
1 parent f77fa70 commit 10c2c7b

15 files changed: +97 −46 lines changed

components/src/dynamo/vllm/multimodal_handlers/processor_handler.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
 from vllm.outputs import RequestOutput
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike as AnyTokenizer

 from dynamo.runtime import Client
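Note: this same import move (vllm.transformers_utils.tokenizer → vllm.tokenizers) recurs in several files below. If a deployment has to straddle the version bump, a small compatibility shim is one option; the sketch below is illustrative only and is not part of the commit.

# Hypothetical shim (not in this commit): keep the AnyTokenizer name working
# on both sides of the vllm version bump by preferring the new import path.
try:
    from vllm.tokenizers import TokenizerLike as AnyTokenizer  # post-bump vllm
except ImportError:
    from vllm.transformers_utils.tokenizer import AnyTokenizer  # pre-bump vllm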

components/src/dynamo/vllm/multimodal_utils/chat_processor.py

Lines changed: 35 additions & 10 deletions
@@ -28,9 +28,22 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_engine import RequestPrompt
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.inputs.data import TokensPrompt
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.tokenizers import TokenizerLike as AnyTokenizer
+
+
+class StubEngineClient:
+    """
+    Stub EngineClient for preprocessing-only use of OpenAIServingChat/Completion.
+    Provides the minimal attributes required by OpenAIServingModels.
+    """
+
+    def __init__(self, model_config: ModelConfig):
+        self.model_config = model_config
+        self.input_processor = None
+        self.io_processor = None


 @runtime_checkable
@@ -120,12 +133,19 @@ class ChatProcessor:
     def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
         self.tokenizer = tokenizer
         self.model_config = model_config
+        # Create stub engine client and models for preprocessing-only usage
+        stub_engine = StubEngineClient(model_config)
+        serving_models = OpenAIServingModels(
+            engine_client=stub_engine,
+            base_model_paths=[
+                BaseModelPath(name=model_config.model, model_path=model_config.model)
+            ],
+        )
         self.openai_serving = OpenAIServingChat(
-            engine_client=None,
-            model_config=model_config,
-            models=None,
-            request_logger=None,
+            engine_client=stub_engine,
+            models=serving_models,
             response_role="assistant",
+            request_logger=None,
             chat_template=None,
             chat_template_content_format="auto",
         )
@@ -186,7 +206,6 @@ async def stream_response(
             conversation,
             self.tokenizer,
             request_metadata,
-            enable_force_include_usage=False,
         ):
             if raw_response.startswith("data: [DONE]"):
                 yield raw_response
@@ -220,7 +239,6 @@ async def stream_response(
             conversation,
             self.tokenizer,
             request_metadata,
-            enable_force_include_usage=False,
         ):
             if raw_response.startswith("data: [DONE]"):
                 break
@@ -267,10 +285,17 @@ class CompletionsProcessor:
     def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
         self.tokenizer = tokenizer
         self.model_config = model_config
+        # Create stub engine client and models for preprocessing-only usage
+        stub_engine = StubEngineClient(model_config)
+        serving_models = OpenAIServingModels(
+            engine_client=stub_engine,
+            base_model_paths=[
+                BaseModelPath(name=model_config.model, model_path=model_config.model)
+            ],
+        )
         self.openai_serving = OpenAIServingCompletion(
-            engine_client=None,
-            model_config=model_config,
-            models=None,
+            engine_client=stub_engine,
+            models=serving_models,
             request_logger=None,
         )
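For orientation, a brief usage sketch of the new preprocessing-only wiring. The `tokenizer` and `model_config` objects are assumed to come from the surrounding worker setup; they are illustrative, not part of this commit.

# Illustrative sketch only: the processors now build their OpenAI serving
# objects from a StubEngineClient + OpenAIServingModels pair instead of None,
# which the bumped vllm no longer accepts. `tokenizer` and `model_config` are
# assumed to be provided by the worker's engine setup.
chat = ChatProcessor(tokenizer=tokenizer, model_config=model_config)
completions = CompletionsProcessor(tokenizer=tokenizer, model_config=model_config)
# Only request preprocessing (prompt/response shaping) goes through these;
# no generation is issued against the stub engine client.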

components/src/dynamo/vllm/multimodal_utils/protocol.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@
 from vllm.multimodal.inputs import MultiModalUUIDDict  # noqa: F401
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import RequestMetrics
+from vllm.v1.metrics.stats import RequestStateStats

 import dynamo.nixl_connect as connect

@@ -156,7 +156,7 @@ class MyRequestOutput(BaseModel):
     https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85

     This class is used to serialize the RequestOutput and any recursively defined types
-    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
+    We can do this because PromptLogprobs, RequestStateStats, and CompletionOutput are all serializable dataclasses
     """

     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -167,7 +167,7 @@ class MyRequestOutput(BaseModel):
     prompt_logprobs: Optional[PromptLogprobs] = None
     outputs: List[CompletionOutput]
     finished: bool
-    metrics: Optional[RequestMetrics] = None
+    metrics: Optional[RequestStateStats] = None
     kv_transfer_params: Optional[dict[str, Any]] = None
     # lora_request: Optional[LoRARequest] = None
     # encoder_prompt: Optional[str] = None
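On the serialization side, the docstring above says the swapped-in type is still a serializable dataclass, so the pydantic round trip should be unchanged. Sketch only; `request_output` is an assumed, pre-built MyRequestOutput instance.

# Illustrative only: MyRequestOutput still serializes via pydantic v2, with the
# metrics field now typed as vllm's v1 RequestStateStats (or None).
payload = request_output.model_dump_json()
# Whether metrics is populated depends on the worker; consumers should treat it
# as optional, exactly as the Optional[...] annotation above states.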

examples/backends/vllm/launch/agg_multimodal_epd.sh

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_

 # run E/P/D workers
 CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
-CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
+CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS &

 # Wait for all background processes to complete
 wait

examples/backends/vllm/launch/disagg_multimodal_epd.sh

Lines changed: 6 additions & 9 deletions
@@ -81,23 +81,20 @@ python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_

 # Configure GPU memory optimization for specific models
 EXTRA_ARGS=""
-if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
-fi

 # Start encode worker
-echo "Starting encode worker on GPU 1..."
-VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
+echo "Starting encode worker on GPU 0..."
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &

 # Start prefill worker
-echo "Starting prefill worker on GPU 2..."
+echo "Starting prefill worker on GPU 1..."
 VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
-CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
+CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

 # Start decode worker
-echo "Starting decode worker on GPU 3..."
+echo "Starting decode worker on GPU 2..."
 VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
-CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
+CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &

 echo "=================================================="
 echo "All components started. Waiting for initialization..."

examples/multimodal/components/audio_encode_worker.py

Lines changed: 1 addition & 2 deletions
@@ -25,7 +25,7 @@
 import uvloop
 from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 import dynamo.nixl_connect as connect
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
@@ -201,7 +201,6 @@ async def async_init(self, runtime: DistributedRuntime):
         # Create and initialize a dynamo connector for this worker.
         # We'll needs this to move data between this worker and remote workers efficiently.
         self._connector = connect.Connector()
-        await self._connector.initialize()

         logger.info("Startup completed.")
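Worth noting from the second hunk: the connector is now used straight after construction, with no separate initialize() call. A sketch of the resulting pattern, assuming only the dynamo.nixl_connect usage visible in this diff:

# Sketch of the post-change pattern (assumption: construction alone is enough,
# as implied by the removed `await self._connector.initialize()` line above).
import dynamo.nixl_connect as connect

connector = connect.Connector()  # used directly; no explicit initialize() await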

examples/multimodal/components/encode_worker.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 import uvloop
 from transformers import AutoImageProcessor
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 import dynamo.nixl_connect as connect
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker

examples/multimodal/components/processor.py

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
 from vllm.outputs import RequestOutput
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import FlexibleArgumentParser
+from vllm.tokenizers import TokenizerLike as AnyTokenizer
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 from dynamo.llm import ModelInput, ModelType, register_llm
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker

examples/multimodal/components/publisher.py

Lines changed: 4 additions & 0 deletions
@@ -38,6 +38,8 @@ def record(
         scheduler_stats: Optional[SchedulerStats],
         iteration_stats: Optional[IterationStats],
         engine_idx: int = 0,
+        *args,
+        **kwargs,
     ):
         pass

@@ -74,6 +76,8 @@ def record(
         scheduler_stats: SchedulerStats,
         iteration_stats: Optional[IterationStats],
         engine_idx: int = 0,
+        *args,
+        **kwargs,
     ):
         # request_total_slots and kv_total_blocks are properties of model + gpu
         # we should only publish them once, not every metric update
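The *args/**kwargs padding is a forward-compatibility move: a newer vllm may pass extra arguments to record(), and the overrides absorb them instead of raising TypeError. A minimal, self-contained illustration of the same pattern (the class below is hypothetical, not from this commit):

# Hypothetical stand-alone example of the trick used above.
class NullStatLogger:
    def record(self, scheduler_stats=None, iteration_stats=None, engine_idx=0, *args, **kwargs):
        # Any extra positional/keyword arguments a newer caller adds are
        # accepted and ignored, so the override keeps matching the caller's signature.
        pass

NullStatLogger().record(None, None, 0, "new-positional-arg", new_kwarg=True)  # no TypeError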

examples/multimodal/components/video_encode_worker.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 import torch
 import uvloop
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 import dynamo.nixl_connect as connect
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
