
Commit b49b40b

feat(metrics): add generation_tokens_total and prompt_tokens_total metrics

Signed-off-by: CYJiang <[email protected]>
1 parent 5c58b12

File tree: 5 files changed, +200 -0 lines

- README.md
- pkg/common/config.go
- pkg/llm-d-inference-sim/metrics.go
- pkg/llm-d-inference-sim/metrics_test.go
- pkg/llm-d-inference-sim/simulator.go

README.md

Lines changed: 2 additions & 0 deletions

@@ -34,9 +34,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:generation_tokens_total | Total number of generated tokens. |
 | vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:prompt_tokens_total | Total number of prompt tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |
 
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
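With fake metrics enabled, the two new counters appear in the /metrics exposition alongside the existing histograms. A minimal sketch of that output, assuming a served model named testmodel and the explicit totals used in the test added in this commit (12345 and 67890); the HELP/TYPE lines follow from the counter registrations in metrics.go:

    # HELP vllm:prompt_tokens_total Total number of prompt tokens processed.
    # TYPE vllm:prompt_tokens_total counter
    vllm:prompt_tokens_total{model_name="testmodel"} 12345
    # HELP vllm:generation_tokens_total Total number of generated tokens.
    # TYPE vllm:generation_tokens_total counter
    vllm:generation_tokens_total{model_name="testmodel"} 67890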

pkg/common/config.go

Lines changed: 10 additions & 0 deletions

@@ -252,13 +252,23 @@ type Metrics struct {
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
 	// RequestPromptTokens RequestGenerationTokens RequestParamsMaxTokens Histogram fake-observation arrays for init.
 	// Each value will be passed to Observe() once at start-up.
+	// exactly once during initialization. Additionally:
+	// - The sum of RequestPromptTokens initializes vllm:prompt_tokens_total.
+	// - The sum of RequestGenerationTokens initializes vllm:generation_tokens_total.
+	// If TotalPromptTokens or TotalGenerationTokens are provided,
+	// they override these sums and are used directly as the total token counts.
 	RequestPromptTokens []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"` // prompt-length samples
 	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	RequestMaxGenerationTokens []int `yaml:"request-max-generation-tokens" json:"request-max-generation-tokens"` // request_max_num_generation_tokens samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
 
+	// TotalPromptTokens is the total number of prompt tokens processed
+	TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
+	// TotalGenerationTokens is the total number of generated tokens
+	TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
+
 	// Latency histograms - have same buckets upper boundaries in seconds are:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
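The new fields ride on the same --fake-metrics JSON blob as the existing fake-metric samples, as exercised by the test added in metrics_test.go below. A minimal launch sketch; the binary name is illustrative, and only the --model and --fake-metrics flags and the JSON keys shown in the test are confirmed by this diff:

    llm-d-inference-sim --model testmodel \
      --fake-metrics '{
        "request-prompt-tokens": [100, 200],
        "request-generation-tokens": [50, 150],
        "total-prompt-tokens": 12345,
        "total-generation-tokens": 67890
      }'

When the explicit totals are present they are used verbatim; when they are omitted, the totals are estimated from the histogram samples via estimateTokenTotal (see metrics.go below).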

pkg/llm-d-inference-sim/metrics.go

Lines changed: 77 additions & 0 deletions

@@ -44,6 +44,8 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName = "vllm:request_params_max_tokens"
 	promptTokensMetricName = "vllm:request_prompt_tokens"
+	generationTokensTotalMetricName = "vllm:generation_tokens_total"
+	promptTokensTotalMetricName = "vllm:prompt_tokens_total"
 	successTotalMetricName = "vllm:request_success_total"
 	loraRequestsMetricName = "vllm:lora_requests_info"
 	reqRunningMetricName = "vllm:num_requests_running"

@@ -275,6 +277,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.metrics.promptTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      promptTokensTotalMetricName,
+			Help:      "Total number of prompt tokens processed.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.promptTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus prompt_tokens_total counter register failed")
+		return err
+	}
+
+	s.metrics.generationTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      generationTokensTotalMetricName,
+			Help:      "Total number of generated tokens.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.generationTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus generation_tokens_total counter register failed")
+		return err
+	}
+
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",

@@ -325,9 +355,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.Gaug
 	buckets := build125Buckets(s.config.MaxModelLen)
 	if s.config.FakeMetrics.RequestPromptTokens != nil {
 		s.initFakeHistogram(s.metrics.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		var promptTotal int64
+		if s.config.FakeMetrics.TotalPromptTokens != nil {
+			promptTotal = *s.config.FakeMetrics.TotalPromptTokens
+		} else {
+			promptTotal = estimateTokenTotal(s.config.FakeMetrics.RequestPromptTokens, buckets)
+		}
+		s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTotal))
 	}
 	if s.config.FakeMetrics.RequestGenerationTokens != nil {
 		s.initFakeHistogram(s.metrics.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		var genTotal int64
+		if s.config.FakeMetrics.TotalGenerationTokens != nil {
+			genTotal = *s.config.FakeMetrics.TotalGenerationTokens
+		} else {
+			genTotal = estimateTokenTotal(s.config.FakeMetrics.RequestGenerationTokens, buckets)
+		}
+		s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(genTotal))
 	}
 	if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
 		s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)

@@ -708,6 +752,8 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
 	s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTokens))
+	s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(generationTokens))
 	if maxTokens != nil {
 		s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
 	}

@@ -745,3 +791,34 @@ func build125Buckets(maxValue int) []float64 {
 	}
 	return buckets
 }
+
+// estimateTokenTotal estimates the total number of tokens based on histogram bucket boundaries
+// and the number of requests in each bucket. It assumes that requests in a bucket have token
+// lengths uniformly distributed between the bucket's lower and upper bounds, and uses the
+// midpoint as a representative value for estimation.
+//
+// The last bucket is treated as [buckets[len(buckets)-1], +Inf), so its upper bound is approximated
+// as twice the lower bound for midpoint calculation.
+func estimateTokenTotal(counts []int, buckets []float64) int64 {
+	if len(counts) == 0 || len(buckets) == 0 {
+		return 0
+	}
+	var total int64
+	n := len(buckets)
+	for i, count := range counts {
+		if count == 0 {
+			continue
+		}
+		var lower, upper float64
+		lower = buckets[i]
+		if i+1 < n {
+			upper = buckets[i+1]
+		} else {
+			// Approximate upper bound for the last (+Inf) bucket
+			upper = lower * 2
+		}
+		mid := (lower + upper) / 2.0
+		total += int64(float64(count) * mid)
+	}
+	return total
+}
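To make the bucket-midpoint estimation concrete, here is a standalone sketch that reproduces the same arithmetic estimateTokenTotal performs for the counts {2, 3} and buckets {0, 10, 20} used in the unit tests below. It re-implements the midpoint logic inline rather than calling the unexported function, so it can be run on its own:

    package main

    import "fmt"

    func main() {
    	// Bucket boundaries {0, 10, 20} define the ranges [0,10), [10,20) and [20,+Inf).
    	counts := []int{2, 3}
    	buckets := []float64{0, 10, 20}
    	var total int64
    	for i, c := range counts {
    		lower := buckets[i]
    		upper := lower * 2 // approximation used for the last (+Inf) bucket
    		if i+1 < len(buckets) {
    			upper = buckets[i+1]
    		}
    		// The bucket midpoint is taken as the representative token length.
    		total += int64(float64(c) * (lower + upper) / 2)
    	}
    	fmt.Println(total) // 2*5 + 3*15 = 55, matching the "two buckets, exact midpoints" test case
    }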

pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 107 additions & 0 deletions

@@ -162,6 +162,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		}
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1)))
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 25`))
 
 		// request_generation_tokens
 		// We do not verify the distribution of the number of tokens generated per request,

@@ -704,12 +705,46 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount)))
+		Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 310`))
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 310`))
 
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`))
 		Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`))
 	})
+	It("Should use TotalPromptTokens and TotalGenerationTokens if provided", func() {
+		ctx := context.TODO()
+		args := []string{
+			"cmd", "--model", testModel, "--mode", common.ModeRandom,
+			"--fake-metrics",
+			`{` +
+				`"running-requests":5,` +
+				`"waiting-requests":2,` +
+				`"kv-cache-usage":0.1,` +
+				`"request-prompt-tokens":[100,200],` +
+				`"request-generation-tokens":[50,150],` +
+				`"total-prompt-tokens":12345,` + // explicit total
+				`"total-generation-tokens":67890,` + // explicit total
+				`"request-success-total":{"stop":10}` +
+				`}`,
+		}
+
+		client, err := startServerWithArgs(ctx, args)
+		Expect(err).NotTo(HaveOccurred())
+
+		resp, err := client.Get(metricsUrl)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+		data, err := io.ReadAll(resp.Body)
+		Expect(err).NotTo(HaveOccurred())
+		metrics := string(data)
+
+		// Verify that the explicit totals are used
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 12345`))
+		Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 67890`))
+	})
 	})
 
 	Context("fake ttft metrics", func() {

@@ -940,3 +975,75 @@ var _ = Describe("build125Buckets", Ordered, func() {
 		}
 	})
 })
+
+var _ = Describe("estimateTokenTotal", func() {
+	It("should correctly estimate total tokens from bucket counts and boundaries", func() {
+		tests := []struct {
+			name     string
+			counts   []int
+			buckets  []float64
+			expected int64
+		}{
+			{
+				name:     "empty counts",
+				counts:   []int{},
+				buckets:  []float64{1, 2, 5},
+				expected: 0,
+			},
+			{
+				name:     "empty buckets",
+				counts:   []int{10, 20},
+				buckets:  []float64{},
+				expected: 0,
+			},
+			{
+				name:     "single bucket, single request",
+				counts:   []int{1},
+				buckets:  []float64{10},
+				expected: 15, // bucket [10, +Inf) → upper = 20, mid = (10+20)/2 = 15
+			},
+			{
+				name:     "two buckets, exact midpoints",
+				counts:   []int{2, 3},
+				buckets:  []float64{0, 10, 20}, // buckets: [0,10), [10,20), [20,+Inf) — but only 2 counts → use first two
+				expected: int64(2*5 + 3*15), // (0+10)/2=5, (10+20)/2=15 → 10 + 45 = 55
+			},
+			{
+				name:    "three buckets including last (+Inf)",
+				counts:  []int{1, 1, 1},
+				buckets: []float64{10, 20, 50},
+				expected: int64(
+					1*((10+20)/2) + // 15
+						1*((20+50)/2) + // 35
+						1*((50+50*2)/2), // last bucket: upper = 50*2 = 100 → mid = (50+100)/2 = 75
+				), // 15 + 35 + 75 = 125
+			},
+			{
+				name:    "zero counts in some buckets",
+				counts:  []int{0, 5, 0, 2},
+				buckets: []float64{1, 10, 100, 1000},
+				expected: int64(
+					5*((10+100)/2) + // 5 * 55 = 275
+						2*((1000+1000*2)/2), // last bucket: 1000*2=2000 → mid=1500 → 2*1500=3000
+				), // 275 + 3000 = 3275
+			},
+			{
+				name:     "only last bucket has requests",
+				counts:   []int{0, 0, 4},
+				buckets:  []float64{10, 100, 1000},
+				expected: 4 * ((1000 + 2000) / 2), // 4 * 1500 = 6000
+			},
+			{
+				name:     "non-integer midpoints rounded down via int64 cast",
+				counts:   []int{1},
+				buckets:  []float64{1}, // mid = (1 + 2)/2 = 1.5 → float64(1)*1.5 = 1.5 → int64 = 1
+				expected: 1,
+			},
+		}
+
+		for _, test := range tests {
+			result := estimateTokenTotal(test.counts, test.buckets)
+			Expect(result).To(Equal(test.expected), "test case: %s", test.name)
+		}
+	})
+})
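Assuming the suite is wired into go test in the usual Ginkgo fashion (an assumption based on the test file layout, not confirmed by this diff), the new counter and estimator cases can be exercised from the repository root with:

    go test ./pkg/llm-d-inference-sim/...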

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 0 deletions

@@ -134,6 +134,10 @@ type metricsData struct {
 	requestPromptTokens *prometheus.HistogramVec
 	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
 	requestGenerationTokens *prometheus.HistogramVec
+	// promptTokensTotal is prometheus counter for total number of input (prompt) tokens
+	promptTokensTotal *prometheus.CounterVec
+	// generationTokensTotal is prometheus counter for total number of generated tokens
+	generationTokensTotal *prometheus.CounterVec
 	// maxNumGenerationTokens is prometheus histogram for maximum number of generated tokens in request
 	maxNumGenerationTokens *prometheus.HistogramVec
 	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request
