Choose latencies randomly (#103)

irar2 · web-flow · commit 7f1f76642832 · 2025-07-20T11:12:06.000+03:00
* Choose latencies randomly

Signed-off-by: Ira <IRAR@il.ibm.com>

* Improved code readability

Signed-off-by: Ira <IRAR@il.ibm.com>

---------

Signed-off-by: Ira <IRAR@il.ibm.com>
diff --git a/README.md b/README.md
@@ -98,8 +98,11 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
     - `echo`: returns the same text that was sent in the request
     - `random`: returns a sentence chosen at random from a set of pre-defined sentences
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
+- `time-to-first-token-std-dev`: standard deviation of the time to the first token (in milliseconds), optional, by default zero; cannot be more than 30% of `time-to-first-token`, and the actual time to first token will never differ from `time-to-first-token` by more than 70%
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
+- `inter-token-latency-std-dev`: standard deviation of the time between generated tokens (in milliseconds), optional, by default zero; cannot be more than 30% of `inter-token-latency`, and the actual inter-token latency will never differ from `inter-token-latency` by more than 70%
 - `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
+- `kv-cache-transfer-latency-std-dev`: standard deviation of the time to "transfer" the KV-cache from another vLLM instance when P/D is activated (in milliseconds), optional, by default zero; cannot be more than 30% of `kv-cache-transfer-latency`, and the actual latency will never differ from `kv-cache-transfer-latency` by more than 70%
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 - `max-tool-call-integer-param`: the maximum possible value of integer parameters in a tool call, optional, defaults to 100
 - `min-tool-call-integer-param`: the minimum possible value of integer parameters in a tool call, optional, defaults to 0
diff --git a/pkg/llm-d-inference-sim/config.go b/pkg/llm-d-inference-sim/config.go
@@ -51,10 +51,24 @@ type configuration struct {
 
 	// TimeToFirstToken time before the first token will be returned, in milliseconds
 	TimeToFirstToken int `yaml:"time-to-first-token"`
+	// TimeToFirstTokenStdDev standard deviation for time before the first token will be returned,
+	// in milliseconds, optional, default is 0, can't be more than 30% of TimeToFirstToken, will not
+	// cause the actual time to first token to differ by more than 70% from TimeToFirstToken
+	TimeToFirstTokenStdDev int `yaml:"time-to-first-token-std-dev"`
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency"`
-	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
+	// InterTokenLatencyStdDev standard deviation for time between generated tokens, in milliseconds,
+	// optional, default is 0, can't be more than 30% of InterTokenLatency, will not cause the actual
+	// inter token latency to differ by more than 70% from InterTokenLatency
+	InterTokenLatencyStdDev int `yaml:"inter-token-latency-std-dev"`
+	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated,
+	// in milliseconds
 	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
+	// KVCacheTransferLatencyStdDev standard deviation for time to "transfer" kv-cache from another
+	// vLLM instance in case P/D is activated, in milliseconds, optional, default is 0, can't be more
+	// than 30% of KVCacheTransferLatency, will not cause the actual latency to differ by more than 70% from
+	// KVCacheTransferLatency
+	KVCacheTransferLatencyStdDev int `yaml:"kv-cache-transfer-latency-std-dev"`
 
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode"`
@@ -178,12 +192,30 @@ func (c *configuration) validate() error {
 	if c.InterTokenLatency < 0 {
 		return errors.New("inter token latency cannot be negative")
 	}
+	if c.InterTokenLatencyStdDev < 0 {
+		return errors.New("inter token latency standard deviation cannot be negative")
+	}
+	if float32(c.InterTokenLatencyStdDev) > 0.3*float32(c.InterTokenLatency) {
+		return errors.New("inter token latency standard deviation cannot be more than 30% of inter token latency")
+	}
 	if c.TimeToFirstToken < 0 {
 		return errors.New("time to first token cannot be negative")
 	}
+	if c.TimeToFirstTokenStdDev < 0 {
+		return errors.New("time to first token standard deviation cannot be negative")
+	}
+	if float32(c.TimeToFirstTokenStdDev) > 0.3*float32(c.TimeToFirstToken) {
+		return errors.New("time to first token standard deviation cannot be more than 30% of time to first token")
+	}
 	if c.KVCacheTransferLatency < 0 {
 		return errors.New("kv-cache tranfer time cannot be negative")
 	}
+	if c.KVCacheTransferLatencyStdDev < 0 {
+		return errors.New("kv-cache tranfer time standard deviation cannot be negative")
+	}
+	if float32(c.KVCacheTransferLatencyStdDev) > 0.3*float32(c.KVCacheTransferLatency) {
+		return errors.New("kv-cache tranfer standard deviation cannot be more than 30% of kv-cache tranfer")
+	}
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
 	}
diff --git a/pkg/llm-d-inference-sim/config_test.go b/pkg/llm-d-inference-sim/config_test.go
@@ -258,6 +258,36 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--object-tool-call-not-required-field-probability", "1210",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid time-to-first-token-std-dev",
+			args: []string{"cmd", "--time-to-first-token-std-dev", "3000",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) time-to-first-token-std-dev",
+			args: []string{"cmd", "--time-to-first-token-std-dev", "10", "--time-to-first-token-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid inter-token-latency-std-dev",
+			args: []string{"cmd", "--inter-token-latency", " 1000", "--inter-token-latency-std-dev", "301",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) inter-token-latency-std-dev",
+			args: []string{"cmd", "--inter-token-latency", " 1000", "--inter-token-latency-std-dev", "-1",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid kv-cache-transfer-latency-std-dev",
+			args: []string{"cmd", "--kv-cache-transfer-latency", "70", "--kv-cache-transfer-latency-std-dev", "35",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid (negative) kv-cache-transfer-latency-std-dev",
+			args: []string{"cmd", "--kv-cache-transfer-latency-std-dev", "-35",
+				"--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -158,6 +158,9 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
 	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
+	f.IntVar(&config.InterTokenLatencyStdDev, "inter-token-latency-std-dev", config.InterTokenLatencyStdDev, "Standard deviation for time between generated tokens (in milliseconds)")
+	f.IntVar(&config.TimeToFirstTokenStdDev, "time-to-first-token-std-dev", config.TimeToFirstTokenStdDev, "Standard deviation for time before the first token will be returned (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferLatencyStdDev, "kv-cache-transfer-latency-std-dev", config.KVCacheTransferLatencyStdDev, "Standard deviation for time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
 
 	f.IntVar(&config.MaxToolCallIntegerParam, "max-tool-call-integer-param", config.MaxToolCallIntegerParam, "Maximum possible value of integer parameters in a tool call")
@@ -674,7 +677,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 
 	// calculate how long to wait before returning the response, time is based on number of tokens
 	numOfTokens := usageData.CompletionTokens
-	totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + (numOfTokens-1)*s.config.InterTokenLatency
+	totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens)
 	time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
 
 	// TODO - maybe add pod id to response header for testing
@@ -687,10 +690,29 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 
 // returns time to first token based on the current request's doRemotePrefill
 func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
+	mean := float64(s.config.TimeToFirstToken)
+	stddev := float64(s.config.TimeToFirstTokenStdDev)
 	if doRemotePrefill {
-		return s.config.KVCacheTransferLatency
+		mean = float64(s.config.KVCacheTransferLatency)
+		stddev = float64(s.config.KVCacheTransferLatencyStdDev)
 	}
-	return s.config.TimeToFirstToken
+	return int(randomNorm(mean, stddev))
+}
+
+// getInterTokenLatency returns a single inter-token latency drawn by randomNorm from the configured InterTokenLatency (mean) and InterTokenLatencyStdDev
+func (s *VllmSimulator) getInterTokenLatency() int {
+	mean := float64(s.config.InterTokenLatency)
+	stddev := float64(s.config.InterTokenLatencyStdDev)
+	return int(randomNorm(mean, stddev)) // the int conversion truncates the fractional part of the sample
+}
+
+// getTotalInterTokenLatency returns the sum of numOfTokens-1 independently sampled inter-token latencies; the result is 0 when numOfTokens is 1 or less
+func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int {
+	total := 0
+	for range numOfTokens - 1 { // one sample per token after the first
+		total += s.getInterTokenLatency()
+	}
+	return total
 }
 
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go
@@ -489,4 +489,84 @@ var _ = Describe("Simulator", func() {
 			Expect(string(body)).To(ContainSubstring("BadRequestError"))
 		})
 	})
+
+	Describe("Check random latencies", Ordered, func() {
+		var simulator *VllmSimulator
+
+		BeforeAll(func() {
+			var err error
+			simulator, err = New(klog.Background())
+			Expect(err).NotTo(HaveOccurred())
+
+			simulator.config = newConfig()
+			simulator.config.TimeToFirstToken = 2048
+			simulator.config.TimeToFirstTokenStdDev = 2048
+			simulator.config.KVCacheTransferLatency = 2048
+			simulator.config.KVCacheTransferLatencyStdDev = 2048
+		})
+
+		DescribeTable("should calculate inter token latency correctly",
+			func(interTokenLatency int, stddev int) {
+				simulator.config.InterTokenLatency = interTokenLatency
+				simulator.config.InterTokenLatencyStdDev = stddev
+				interToken := simulator.getInterTokenLatency()
+				Expect(interToken).To(BeNumerically(">=", float32(interTokenLatency)*0.3))
+				Expect(interToken).To(BeNumerically("<=", float32(interTokenLatency)*1.7))
+			},
+			func(interTokenLatency int, stddev int) string {
+				return fmt.Sprintf("interTokenLatency: %d stddev: %d", interTokenLatency, stddev)
+			},
+			Entry(nil, 1000, 300),
+			Entry(nil, 1000, 800), // invalid std dev, used for testing purposes
+			Entry(nil, 1000, 900), // invalid std dev, used for testing purposes
+			Entry(nil, 1000, 0),
+		)
+
+		DescribeTable("should calculate total inter token latency correctly",
+			func(interTokenLatency int, stddev int, numberOfTokens int) {
+				simulator.config.InterTokenLatency = interTokenLatency
+				simulator.config.InterTokenLatencyStdDev = stddev
+				latency := simulator.getTotalInterTokenLatency(numberOfTokens)
+				Expect(latency).To(BeNumerically(">=", float32(interTokenLatency)*0.3*float32(numberOfTokens)))
+				Expect(latency).To(BeNumerically("<=", float32(interTokenLatency)*1.7*float32(numberOfTokens)))
+			},
+			func(interTokenLatency int, stddev int, numberOfTokens int) string {
+				return fmt.Sprintf("interTokenLatency: %d stddev: %d, numberOfTokens: %d", interTokenLatency,
+					stddev, numberOfTokens)
+			},
+			Entry(nil, 1000, 30, 100),
+			Entry(nil, 1000, 800, 20), // invalid std dev, used for testing purposes
+			Entry(nil, 1000, 900, 5),  // invalid std dev, used for testing purposes
+			Entry(nil, 1000, 0, 50),
+		)
+
+		DescribeTable("should calculate time to first token correctly",
+			func(timeToFirstToken int, timeToFirstTokenStdDev int,
+				kvCacheLatency int, kvCacheLatencyStdDev int, doREmotePrefill bool) {
+				simulator.config.TimeToFirstToken = timeToFirstToken
+				simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
+				simulator.config.KVCacheTransferLatency = kvCacheLatency
+				simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
+				timeToFirst := simulator.getTimeToFirstToken(doREmotePrefill)
+				if doREmotePrefill {
+					Expect(timeToFirst).To(BeNumerically(">=", float32(kvCacheLatency)*0.3))
+					Expect(timeToFirst).To(BeNumerically("<=", float32(kvCacheLatency)*1.7))
+				} else {
+					Expect(timeToFirst).To(BeNumerically(">=", float32(timeToFirstToken)*0.3))
+					Expect(timeToFirst).To(BeNumerically("<=", float32(timeToFirstToken)*1.7))
+				}
+			},
+			func(timeToFirstToken int, timeToFirstTokenStdDev int,
+				kvCacheLatency int, kvCacheLatencyStdDev int, doREmotePrefill bool) string {
+				return fmt.Sprintf("timeToFirstToken: %d stddev: %d kvCacheLatency: %d stddev: %d doREmotePrefill: %t",
+					timeToFirstToken, timeToFirstTokenStdDev, kvCacheLatency, kvCacheLatencyStdDev, doREmotePrefill)
+			},
+			Entry(nil, 10000, 300, 1000, 200, true),
+			Entry(nil, 10000, 300, 1000, 200, false),
+			Entry(nil, 10000, 9000, 1000, 800, true),  // invalid std dev, used for testing purposes
+			Entry(nil, 10000, 8000, 1000, 900, false), // invalid std dev, used for testing purposes
+			Entry(nil, 10000, 0, 1000, 0, true),
+			Entry(nil, 10000, 0, 1000, 0, false),
+		)
+	})
 })
diff --git a/pkg/llm-d-inference-sim/streaming.go b/pkg/llm-d-inference-sim/streaming.go
@@ -91,7 +91,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ
 
 	for i, token := range tokens {
 		if i != 0 {
-			time.Sleep(time.Duration(s.config.InterTokenLatency) * time.Millisecond)
+			time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond)
 		}
 		var toolChunkInsert *toolCall
 		if tc != nil {
diff --git a/pkg/llm-d-inference-sim/utils.go b/pkg/llm-d-inference-sim/utils.go
@@ -151,6 +151,22 @@ func randomFloat(min float64, max float64) float64 {
 	return randomGenerator.Float64()*(max-min) + min
 }
 
+// randomNorm returns a normally distributed float64 with the given mean and stddev;
+// when stddev is 0 it returns mean itself. A generated value that differs from mean by
+// more than 70% is clamped: to 0.3*mean when it is below that, to 1.7*mean when above
+func randomNorm(mean float64, stddev float64) float64 {
+	if stddev == 0 {
+		return mean
+	}
+	value := randomGenerator.NormFloat64()*stddev + mean // scale and shift the generator's NormFloat64 sample
+	if value < 0.3*mean {
+		value = 0.3 * mean
+	} else if value > 1.7*mean {
+		value = 1.7 * mean
+	}
+	return value
+}
+
 // Regular expression for the response tokenization
 var re *regexp.Regexp
 

Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writ`
`91`	`91`
`92`	`92`	`for i, token := range tokens {`
`93`	`93`	`if i != 0 {`
`94`		`- time.Sleep(time.Duration(s.config.InterTokenLatency) * time.Millisecond)`
	`94`	`+ time.Sleep(time.Duration(s.getInterTokenLatency()) * time.Millisecond)`
`95`	`95`	`}`
`96`	`96`	`var toolChunkInsert *toolCall`
`97`	`97`	`if tc != nil {`