
Commit 4ee6b72

feat(metrics): add generation_tokens_total and prompt_tokens_total metrics
Signed-off-by: CYJiang <[email protected]>
1 parent 5c58b12 commit 4ee6b72

File tree

README.md
pkg/common/config.go
pkg/llm-d-inference-sim/metrics.go
pkg/llm-d-inference-sim/metrics_test.go
pkg/llm-d-inference-sim/simulator.go

5 files changed: +247 -0 lines changed

README.md

Lines changed: 2 additions & 0 deletions
@@ -34,9 +34,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:generation_tokens_total | Total number of generated tokens |
 | vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:prompt_tokens_total | Total number of prompt tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |
 
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
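For reference, the two new counters appear in the simulator's /metrics output in the usual Prometheus exposition format, along the lines of the sketch below (the model name and values are illustrative, taken from the tests in this commit):

    # HELP vllm:prompt_tokens_total Total number of prompt tokens processed.
    # TYPE vllm:prompt_tokens_total counter
    vllm:prompt_tokens_total{model_name="testmodel"} 25
    # HELP vllm:generation_tokens_total Total number of generated tokens.
    # TYPE vllm:generation_tokens_total counter
    vllm:generation_tokens_total{model_name="testmodel"} 140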

pkg/common/config.go

Lines changed: 10 additions & 0 deletions
@@ -252,13 +252,23 @@ type Metrics struct {
     TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
     // RequestPromptTokens RequestGenerationTokens RequestParamsMaxTokens Histogram fake-observation arrays for init.
     // Each value will be passed to Observe() once at start-up.
+    // Additionally:
+    // - The sum of RequestPromptTokens initializes vllm:prompt_tokens_total.
+    // - The sum of RequestGenerationTokens initializes vllm:generation_tokens_total.
+    // If TotalPromptTokens or TotalGenerationTokens are provided,
+    // they override these sums and are used directly as the total token counts.
     RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
     RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
     RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
     RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
     // RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
     RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
 
+    // TotalPromptTokens is the total number of prompt tokens processed
+    TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
+    // TotalGenerationTokens is the total number of generated tokens
+    TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
+
     // Latency histograms - have same buckets upper boundaries in seconds are:
     // 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
     // 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf

pkg/llm-d-inference-sim/metrics.go

Lines changed: 89 additions & 0 deletions
@@ -44,6 +44,8 @@ const (
     generationTokensMetricName = "vllm:request_generation_tokens"
     paramMaxTokensMetricName = "vllm:request_params_max_tokens"
     promptTokensMetricName = "vllm:request_prompt_tokens"
+    generationTokensTotalMetricName = "vllm:generation_tokens_total"
+    promptTokensTotalMetricName = "vllm:prompt_tokens_total"
     successTotalMetricName = "vllm:request_success_total"
     loraRequestsMetricName = "vllm:lora_requests_info"
     reqRunningMetricName = "vllm:num_requests_running"

@@ -275,6 +277,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
         return err
     }
 
+    s.metrics.promptTokensTotal = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Subsystem: "",
+            Name:      promptTokensTotalMetricName,
+            Help:      "Total number of prompt tokens processed.",
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+
+    if err := s.metrics.registry.Register(s.metrics.promptTokensTotal); err != nil {
+        s.logger.Error(err, "prometheus prompt_tokens_total counter register failed")
+        return err
+    }
+
+    s.metrics.generationTokensTotal = prometheus.NewCounterVec(
+        prometheus.CounterOpts{
+            Subsystem: "",
+            Name:      generationTokensTotalMetricName,
+            Help:      "Total number of generated tokens.",
+        },
+        []string{vllmapi.PromLabelModelName},
+    )
+
+    if err := s.metrics.registry.Register(s.metrics.generationTokensTotal); err != nil {
+        s.logger.Error(err, "prometheus generation_tokens_total counter register failed")
+        return err
+    }
+
     s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
         prometheus.CounterOpts{
             Subsystem: "",

@@ -325,9 +355,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.Gaug
     buckets := build125Buckets(s.config.MaxModelLen)
     if s.config.FakeMetrics.RequestPromptTokens != nil {
         s.initFakeHistogram(s.metrics.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+        var promptTotal int64
+        if s.config.FakeMetrics.TotalPromptTokens != nil {
+            promptTotal = *s.config.FakeMetrics.TotalPromptTokens
+        } else {
+            promptTotal = estimateTokenTotal(s.config.FakeMetrics.RequestPromptTokens, buckets)
+        }
+        s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTotal))
     }
     if s.config.FakeMetrics.RequestGenerationTokens != nil {
         s.initFakeHistogram(s.metrics.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+        var genTotal int64
+        if s.config.FakeMetrics.TotalGenerationTokens != nil {
+            genTotal = *s.config.FakeMetrics.TotalGenerationTokens
+        } else {
+            genTotal = estimateTokenTotal(s.config.FakeMetrics.RequestGenerationTokens, buckets)
+        }
+        s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(genTotal))
     }
     if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
         s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)

@@ -708,6 +752,8 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
     modelName := s.getDisplayedModelName(s.config.Model)
     s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
     s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+    s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTokens))
+    s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(generationTokens))
     if maxTokens != nil {
         s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
     }

@@ -745,3 +791,46 @@ func build125Buckets(maxValue int) []float64 {
     }
     return buckets
 }
+
+// estimateTokenTotal estimates the total number of tokens based on histogram bucket boundaries
+// and the number of requests in each bucket. It assumes that requests in a bucket have token
+// lengths uniformly distributed between the bucket's lower and upper bounds, and uses the
+// midpoint as a representative value for estimation.
+//
+// The last bucket is treated as [buckets[len(buckets)-1], +Inf), so its upper bound is approximated
+// as twice the lower bound for midpoint calculation.
+func estimateTokenTotal(counts []int, buckets []float64) int64 {
+    if len(counts) == 0 || len(buckets) == 0 {
+        return 0
+    }
+    var total int64
+    nBuckets := len(buckets)
+    nCounts := len(counts)
+
+    for i := 0; i <= nBuckets; i++ {
+        count := 0
+        if i < nCounts {
+            count = counts[i]
+        }
+        if count == 0 {
+            continue
+        }
+        var lower, upper float64
+        if i == 0 {
+            // First bucket: [0, buckets[0]]
+            lower = 0.0
+            upper = buckets[0]
+        } else if i < nBuckets {
+            // Middle buckets: (buckets[i-1], buckets[i]]
+            lower = buckets[i-1]
+            upper = buckets[i]
+        } else {
+            // Last bucket: (buckets[nBuckets-1], +Inf)
+            lower = buckets[nBuckets-1]
+            upper = lower * 2
+        }
+        mid := (lower + upper) / 2.0
+        total += int64(float64(count) * mid)
+    }
+    return total
+}
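For intuition, the midpoint estimate can be traced by hand for the "collaborator example" case in the tests below: counts [10, 20, 30] over boundaries [1, 2, 5, ...] land in the buckets [0,1], (1,2] and (2,5], so the estimated total is

    10*0.5 + 20*1.5 + 30*3.5 = 5 + 30 + 105 = 140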

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 142 additions & 0 deletions
@@ -162,6 +162,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
             }
             Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1)))
             Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1)))
+            Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 25`))
 
             // request_generation_tokens
             // We do not verify the distribution of the number of tokens generated per request,

@@ -704,12 +705,46 @@ var _ = Describe("Simulator metrics", Ordered, func() {
             Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount)))
             Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount)))
             Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount)))
+            Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 140`))
+            Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 140`))
 
             Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`))
             Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`))
             Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`))
             Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`))
         })
+        It("Should use TotalPromptTokens and TotalGenerationTokens if provided", func() {
+            ctx := context.TODO()
+            args := []string{
+                "cmd", "--model", testModel, "--mode", common.ModeRandom,
+                "--fake-metrics",
+                `{` +
+                    `"running-requests":5,` +
+                    `"waiting-requests":2,` +
+                    `"kv-cache-usage":0.1,` +
+                    `"request-prompt-tokens":[100,200],` +
+                    `"request-generation-tokens":[50,150],` +
+                    `"total-prompt-tokens":12345,` + // explicit total
+                    `"total-generation-tokens":67890,` + // explicit total
+                    `"request-success-total":{"stop":10}` +
+                    `}`,
+            }
+
+            client, err := startServerWithArgs(ctx, args)
+            Expect(err).NotTo(HaveOccurred())
+
+            resp, err := client.Get(metricsUrl)
+            Expect(err).NotTo(HaveOccurred())
+            Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+            data, err := io.ReadAll(resp.Body)
+            Expect(err).NotTo(HaveOccurred())
+            metrics := string(data)
+
+            // Verify that the explicit totals are used
+            Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 12345`))
+            Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 67890`))
+        })
     })
 
     Context("fake ttft metrics", func() {

@@ -940,3 +975,110 @@ var _ = Describe("build125Buckets", Ordered, func() {
         }
     })
 })
+
+var _ = Describe("estimateTokenTotal", func() {
+    It("should correctly estimate total tokens from bucket counts and boundaries", func() {
+        tests := []struct {
+            name     string
+            counts   []int
+            buckets  []float64
+            expected int64
+        }{
+            {
+                name:     "empty counts",
+                counts:   []int{},
+                buckets:  []float64{1, 2, 5},
+                expected: 0,
+            },
+            {
+                name:     "empty buckets",
+                counts:   []int{10, 20},
+                buckets:  []float64{},
+                expected: 0,
+            },
+            {
+                name:     "only first bucket has requests: [0,10]",
+                counts:   []int{1},
+                buckets:  []float64{10},
+                expected: 5,
+                // bucket0: [0,10] → mid=5 → 1*5 = 5
+                // total = 5
+            },
+            {
+                name:     "first two buckets: [0,10], (10,20]",
+                counts:   []int{2, 3},
+                buckets:  []float64{10, 20},
+                expected: 55,
+                // bucket0: [0,10] → mid=5 → 2*5 = 10
+                // bucket1: (10,20] → mid=15 → 3*15 = 45
+                // total = 10 + 45 = 55
+            },
+            {
+                name:     "three finite buckets + last (+Inf) bucket",
+                counts:   []int{1, 1, 1, 1},
+                buckets:  []float64{10, 20, 50},
+                expected: 130,
+                // bucket0: [0,10] → mid=5 → 1*5 = 5
+                // bucket1: (10,20] → mid=15 → 1*15 = 15
+                // bucket2: (20,50] → mid=35 → 1*35 = 35
+                // bucket3: (50,+Inf) → upper=100, mid=75 → 1*75 = 75
+                // total = 5 + 15 + 35 + 75 = 130
+            },
+            {
+                name:     "zero counts in some buckets",
+                counts:   []int{0, 5, 0, 2},
+                buckets:  []float64{1, 10, 100},
+                expected: 327,
+                // bucket1: (1,10] → mid=5.5 → 5*5.5 = 27.5 → truncated to 27
+                // bucket3: (100,+Inf) → upper=200, mid=150 → 2*150 = 300
+                // total = 27 + 300 = 327
+            },
+            {
+                name:     "only last bucket has requests",
+                counts:   []int{0, 0, 0, 4},
+                buckets:  []float64{10, 100, 1000},
+                expected: 6000,
+                // bucket3: (1000,+Inf) → upper=2000, mid=1500 → 4*1500 = 6000
+                // total = 4*1500 = 6000
+            },
+            {
+                name:     "non-integer midpoints truncated by int64 cast",
+                counts:   []int{1},
+                buckets:  []float64{1},
+                expected: 0,
+                // bucket0: [0,1] → mid=0.5 → 1*0.5 = 0.5 → truncated to 0
+            },
+            {
+                name:     "collaborator example: [10,20,30] with long buckets",
+                counts:   []int{10, 20, 30},
+                buckets:  []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+                expected: 140,
+                // bucket0: [0,1] → mid=0.5 → 10*0.5 = 5
+                // bucket1: (1,2] → mid=1.5 → 20*1.5 = 30
+                // bucket2: (2,5] → mid=3.5 → 30*3.5 = 105
+                // total = 5 + 30 + 105 = 140
+            },
+            {
+                name:     "counts shorter than buckets (trailing zeros omitted)",
+                counts:   []int{1, 1},
+                buckets:  []float64{10, 100, 1000, 10000},
+                expected: 60,
+                // bucket0: [0,10] → mid=5 → 1*5 = 5
+                // bucket1: (10,100] → mid=55 → 1*55 = 55
+                // total = 5 + 55 = 60
+            },
+            {
+                name:     "all zero counts",
+                counts:   []int{0, 0, 0},
+                buckets:  []float64{1, 10, 100},
+                expected: 0,
+                // all buckets have zero requests
+            },
+        }
+
+        for _, test := range tests {
+            result := estimateTokenTotal(test.counts, test.buckets)
+            Expect(result).To(Equal(test.expected), "test case: %s", test.name)
+        }
+    })
+})

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 0 deletions
@@ -134,6 +134,10 @@ type metricsData struct {
     requestPromptTokens *prometheus.HistogramVec
     // requestGenerationTokens is prometheus histogram for number of generated tokens in request
     requestGenerationTokens *prometheus.HistogramVec
+    // promptTokensTotal is prometheus counter for total number of input (prompt) tokens
+    promptTokensTotal *prometheus.CounterVec
+    // generationTokensTotal is prometheus counter for total number of generated tokens
+    generationTokensTotal *prometheus.CounterVec
     // maxNumGenerationTokens is prometheus histogram for maximum number of generated tokens in request
     maxNumGenerationTokens *prometheus.HistogramVec
     // requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request
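Since both new fields are plain CounterVec counters that only ever grow, unlike the per-request histograms above, a consumer would typically look at their rate rather than the raw value. A minimal PromQL sketch, assuming a standard Prometheus scrape of the simulator:

    sum by (model_name) (rate(vllm:generation_tokens_total[5m]))

which approximates generated tokens per second per model.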
