
Commit b49b40b

feat(metrics): add generation_tokens_total and prompt_tokens_total metrics

Signed-off-by: CYJiang <[email protected]>
1 parent 5c58b12

File tree: 5 files changed, +200 -0 lines

- README.md
- pkg/common/config.go
- pkg/llm-d-inference-sim/metrics.go
- pkg/llm-d-inference-sim/metrics_test.go
- pkg/llm-d-inference-sim/simulator.go

README.md

Lines changed: 2 additions & 0 deletions

@@ -34,9 +34,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:generation_tokens_total | Total number of generated tokens. |
 | vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:prompt_tokens_total | Total number of prompt tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |
 
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
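With fake metrics enabled, the two new counters appear in the /metrics exposition alongside the existing histograms. A minimal sketch of that output, assuming a served model named testmodel and the explicit totals used in the test added in this commit (12345 and 67890); the HELP/TYPE lines follow from the counter registrations in metrics.go:

    # HELP vllm:prompt_tokens_total Total number of prompt tokens processed.
    # TYPE vllm:prompt_tokens_total counter
    vllm:prompt_tokens_total{model_name="testmodel"} 12345
    # HELP vllm:generation_tokens_total Total number of generated tokens.
    # TYPE vllm:generation_tokens_total counter
    vllm:generation_tokens_total{model_name="testmodel"} 67890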

pkg/common/config.go

Lines changed: 10 additions & 0 deletions

@@ -252,13 +252,23 @@ type Metrics struct {
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
 	// RequestPromptTokens RequestGenerationTokens RequestParamsMaxTokens Histogram fake-observation arrays for init.
 	// Each value will be passed to Observe() once at start-up.
+	// exactly once during initialization. Additionally:
+	// - The sum of RequestPromptTokens initializes vllm:prompt_tokens_total.
+	// - The sum of RequestGenerationTokens initializes vllm:generation_tokens_total.
+	// If TotalPromptTokens or TotalGenerationTokens are provided,
+	// they override these sums and are used directly as the total token counts.
 	RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
 	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
 
+	// TotalPromptTokens is the total number of prompt tokens processed
+	TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
+	// TotalGenerationTokens is the total number of generated tokens
+	TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
+
 	// Latency histograms - have same buckets upper boundaries in seconds are:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
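The new fields ride on the same --fake-metrics JSON blob as the existing fake-metric samples, as exercised by the test added in metrics_test.go below. A minimal launch sketch; the binary name is illustrative, and only the --model and --fake-metrics flags and the JSON keys shown in the test are confirmed by this diff:

    llm-d-inference-sim --model testmodel \
      --fake-metrics '{
        "request-prompt-tokens": [100, 200],
        "request-generation-tokens": [50, 150],
        "total-prompt-tokens": 12345,
        "total-generation-tokens": 67890
      }'

When the explicit totals are present they are used verbatim; when they are omitted, the totals are estimated from the histogram samples via estimateTokenTotal (see metrics.go below).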

pkg/llm-d-inference-sim/metrics.go

Lines changed: 77 additions & 0 deletions

@@ -44,6 +44,8 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName = "vllm:request_params_max_tokens"
 	promptTokensMetricName = "vllm:request_prompt_tokens"
+	generationTokensTotalMetricName = "vllm:generation_tokens_total"
+	promptTokensTotalMetricName = "vllm:prompt_tokens_total"
 	successTotalMetricName = "vllm:request_success_total"
 	loraRequestsMetricName = "vllm:lora_requests_info"
 	reqRunningMetricName = "vllm:num_requests_running"

@@ -275,6 +277,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.metrics.promptTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      promptTokensTotalMetricName,
+			Help:      "Total number of prompt tokens processed.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.promptTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus prompt_tokens_total counter register failed")
+		return err
+	}
+
+	s.metrics.generationTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      generationTokensTotalMetricName,
+			Help:      "Total number of generated tokens.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.generationTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus generation_tokens_total counter register failed")
+		return err
+	}
+
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",

@@ -325,9 +355,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.Gaug
 	buckets := build125Buckets(s.config.MaxModelLen)
 	if s.config.FakeMetrics.RequestPromptTokens != nil {
 		s.initFakeHistogram(s.metrics.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		var promptTotal int64
+		if s.config.FakeMetrics.TotalPromptTokens != nil {
+			promptTotal = *s.config.FakeMetrics.TotalPromptTokens
+		} else {
+			promptTotal = estimateTokenTotal(s.config.FakeMetrics.RequestPromptTokens, buckets)
+		}
+		s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTotal))
 	}
 	if s.config.FakeMetrics.RequestGenerationTokens != nil {
 		s.initFakeHistogram(s.metrics.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		var genTotal int64
+		if s.config.FakeMetrics.TotalGenerationTokens != nil {
+			genTotal = *s.config.FakeMetrics.TotalGenerationTokens
+		} else {
+			genTotal = estimateTokenTotal(s.config.FakeMetrics.RequestGenerationTokens, buckets)
+		}
+		s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(genTotal))
 	}
 	if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
 		s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)

@@ -708,6 +752,8 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
 	s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTokens))
+	s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(generationTokens))
 	if maxTokens != nil {
 		s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
 	}

@@ -745,3 +791,34 @@ func build125Buckets(maxValue int) []float64 {
 	}
 	return buckets
 }
+
+// estimateTokenTotal estimates the total number of tokens based on histogram bucket boundaries
+// and the number of requests in each bucket. It assumes that requests in a bucket have token
+// lengths uniformly distributed between the bucket's lower and upper bounds, and uses the
+// midpoint as a representative value for estimation.
+//
+// The last bucket is treated as [buckets[len(buckets)-1], +Inf), so its upper bound is approximated
+// as twice the lower bound for midpoint calculation.
+func estimateTokenTotal(counts []int, buckets []float64) int64 {
+	if len(counts) == 0 || len(buckets) == 0 {
+		return 0
+	}
+	var total int64
+	n := len(buckets)
+	for i, count := range counts {
+		if count == 0 {
+			continue
+		}
+		var lower, upper float64
+		lower = buckets[i]
+		if i+1 < n {
+			upper = buckets[i+1]
+		} else {
+			// Approximate upper bound for the last (+Inf) bucket
+			upper = lower * 2
+		}
+		mid := (lower + upper) / 2.0
+		total += int64(float64(count) * mid)
+	}
+	return total
+}
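To make the bucket-midpoint estimation concrete, here is a standalone sketch that reproduces the same arithmetic estimateTokenTotal performs for the counts {2, 3} and buckets {0, 10, 20} used in the unit tests below. It re-implements the midpoint logic inline rather than calling the unexported function, so it can be run on its own:

    package main

    import "fmt"

    func main() {
    	// Bucket boundaries {0, 10, 20} define the ranges [0,10), [10,20) and [20,+Inf).
    	counts := []int{2, 3}
    	buckets := []float64{0, 10, 20}
    	var total int64
    	for i, c := range counts {
    		lower := buckets[i]
    		upper := lower * 2 // approximation used for the last (+Inf) bucket
    		if i+1 < len(buckets) {
    			upper = buckets[i+1]
    		}
    		// The bucket midpoint is taken as the representative token length.
    		total += int64(float64(c) * (lower + upper) / 2)
    	}
    	fmt.Println(total) // 2*5 + 3*15 = 55, matching the "two buckets, exact midpoints" test case
    }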

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 107 additions & 0 deletions

@@ -162,6 +162,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		}
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1)))
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 25`))
 
 		// request_generation_tokens
 		// We do not verify the distribution of the number of tokens generated per request,

@@ -704,12 +705,46 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount)))
+		Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 310`))
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 310`))
 
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`))
 	})
+	It("Should use TotalPromptTokens and TotalGenerationTokens if provided", func() {
+		ctx := context.TODO()
+		args := []string{
+			"cmd", "--model", testModel, "--mode", common.ModeRandom,
+			"--fake-metrics",
+			`{` +
+				`"running-requests":5,` +
+				`"waiting-requests":2,` +
+				`"kv-cache-usage":0.1,` +
+				`"request-prompt-tokens":[100,200],` +
+				`"request-generation-tokens":[50,150],` +
+				`"total-prompt-tokens":12345,` + // explicit total
+				`"total-generation-tokens":67890,` + // explicit total
+				`"request-success-total":{"stop":10}` +
+				`}`,
+		}
+
+		client, err := startServerWithArgs(ctx, args)
+		Expect(err).NotTo(HaveOccurred())
+
+		resp, err := client.Get(metricsUrl)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+		data, err := io.ReadAll(resp.Body)
+		Expect(err).NotTo(HaveOccurred())
+		metrics := string(data)
+
+		// Verify that the explicit totals are used
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 12345`))
+		Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 67890`))
+	})
 	})
 
 	Context("fake ttft metrics", func() {

@@ -940,3 +975,75 @@ var _ = Describe("build125Buckets", Ordered, func() {
 		}
 	})
 })
+
+var _ = Describe("estimateTokenTotal", func() {
+	It("should correctly estimate total tokens from bucket counts and boundaries", func() {
+		tests := []struct {
+			name     string
+			counts   []int
+			buckets  []float64
+			expected int64
+		}{
+			{
+				name:     "empty counts",
+				counts:   []int{},
+				buckets:  []float64{1, 2, 5},
+				expected: 0,
+			},
+			{
+				name:     "empty buckets",
+				counts:   []int{10, 20},
+				buckets:  []float64{},
+				expected: 0,
+			},
+			{
+				name:     "single bucket, single request",
+				counts:   []int{1},
+				buckets:  []float64{10},
+				expected: 15, // bucket [10, +Inf) → upper = 20, mid = (10+20)/2 = 15
+			},
+			{
+				name:     "two buckets, exact midpoints",
+				counts:   []int{2, 3},
+				buckets:  []float64{0, 10, 20}, // buckets: [0,10), [10,20), [20,+Inf) — but only 2 counts → use first two
+				expected: int64(2*5 + 3*15), // (0+10)/2=5, (10+20)/2=15 → 10 + 45 = 55
+			},
+			{
+				name:    "three buckets including last (+Inf)",
+				counts:  []int{1, 1, 1},
+				buckets: []float64{10, 20, 50},
+				expected: int64(
+					1*((10+20)/2) + // 15
+						1*((20+50)/2) + // 35
+						1*((50+50*2)/2), // last bucket: upper = 50*2 = 100 → mid = (50+100)/2 = 75
+				), // 15 + 35 + 75 = 125
+			},
+			{
+				name:    "zero counts in some buckets",
+				counts:  []int{0, 5, 0, 2},
+				buckets: []float64{1, 10, 100, 1000},
+				expected: int64(
+					5*((10+100)/2) + // 5 * 55 = 275
+						2*((1000+1000*2)/2), // last bucket: 1000*2=2000 → mid=1500 → 2*1500=3000
+				), // 275 + 3000 = 3275
+			},
+			{
+				name:     "only last bucket has requests",
+				counts:   []int{0, 0, 4},
+				buckets:  []float64{10, 100, 1000},
+				expected: 4 * ((1000 + 2000) / 2), // 4 * 1500 = 6000
+			},
+			{
+				name:     "non-integer midpoints rounded down via int64 cast",
+				counts:   []int{1},
+				buckets:  []float64{1}, // mid = (1 + 2)/2 = 1.5 → float64(1)*1.5 = 1.5 → int64 = 1
+				expected: 1,
+			},
+		}
+
+		for _, test := range tests {
+			result := estimateTokenTotal(test.counts, test.buckets)
+			Expect(result).To(Equal(test.expected), "test case: %s", test.name)
+		}
+	})
+})
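Assuming the suite is wired into go test in the usual Ginkgo fashion (an assumption based on the test file layout, not confirmed by this diff), the new counter and estimator cases can be exercised from the repository root with:

    go test ./pkg/llm-d-inference-sim/...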

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 0 deletions

@@ -134,6 +134,10 @@ type metricsData struct {
 	requestPromptTokens *prometheus.HistogramVec
 	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
 	requestGenerationTokens *prometheus.HistogramVec
+	// promptTokensTotal is prometheus counter for total number of input (prompt) tokens
+	promptTokensTotal *prometheus.CounterVec
+	// generationTokensTotal is prometheus counter for total number of generated tokens
+	generationTokensTotal *prometheus.CounterVec
 	// maxNumGenerationTokens is prometheus histogram for maximum number of generated tokens in request
 	maxNumGenerationTokens *prometheus.HistogramVec
 	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request
