You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: README.md
+3Lines changed: 3 additions & 0 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -98,8 +98,11 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
98
98
-`echo`: returns the same text that was sent in the request
99
99
-`random`: returns a sentence chosen at random from a set of pre-defined sentences
100
100
-`time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
101
+
-`time-to-first-token-std-dev`: standard deviation of the time to first token (in milliseconds), optional, by default zero. Cannot exceed 30% of `time-to-first-token`. The actual time to first token will never differ from `time-to-first-token` by more than 70%
101
102
-`inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
103
+
-`inter-token-latency-std-dev`: standard deviation of the time between generated tokens (in milliseconds), optional, by default zero. Cannot exceed 30% of `inter-token-latency`. The actual inter-token latency will never differ from `inter-token-latency` by more than 70%
102
104
-`kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
105
+
-`kv-cache-transfer-latency-std-dev`: standard deviation of the time to "transfer" the KV-cache from another vLLM instance when P/D is activated (in milliseconds), optional, by default zero. Cannot exceed 30% of `kv-cache-transfer-latency`. The actual latency will never differ from `kv-cache-transfer-latency` by more than 70%
103
106
-`seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
104
107
-`max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
105
108
-`min-tool-call-integer-param`: the minimum possible value of integer parameters in a tool call, optional, defaults to 0
Copy file name to clipboardExpand all lines: pkg/llm-d-inference-sim/simulator.go
+25-3Lines changed: 25 additions & 3 deletions
Original file line number
Diff line number
Diff line change
@@ -158,6 +158,9 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
158
158
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
159
159
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
160
160
f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
161
+
f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
162
+
f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
163
+
f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
161
164
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
162
165
163
166
f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
func (s*VllmSimulator) getInterTokenLatency() int {
704
+
mean:=float64(s.config.InterTokenLatency)
705
+
stddev:=float64(s.config.InterTokenLatencyStdDev)
706
+
returnint(randomNorm(mean, stddev))
707
+
}
708
+
709
+
// returns total inter token latency for the given number of tokens
710
+
func (s*VllmSimulator) getTotalInterTokenLatency(numOfTokensint) int {
711
+
total:=0
712
+
forrangenumOfTokens-1 {
713
+
total+=s.getInterTokenLatency()
714
+
}
715
+
returntotal
694
716
}
695
717
696
718
// createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
0 commit comments