From f9ffada066e8cbee51b799185d3142e334ca5e7c Mon Sep 17 00:00:00 2001
From: CYJiang
Date: Thu, 4 Dec 2025 20:30:07 +0800
Subject: [PATCH] feat(metrics): add generation_tokens_total and
 prompt_tokens_total metrics

Signed-off-by: CYJiang
---
 README.md                               |   2 +
 pkg/common/config.go                    |  13 ++-
 pkg/llm-d-inference-sim/metrics.go      |  93 ++++++++++++++++
 pkg/llm-d-inference-sim/metrics_test.go | 142 ++++++++++++++++++++++++
 pkg/llm-d-inference-sim/simulator.go    |   4 +
 5 files changed, 253 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 610eefd..e2c08c1 100644
--- a/README.md
+++ b/README.md
@@ -35,9 +35,11 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
 | vllm:inter_token_latency_seconds | Histogram of inter-token latency in seconds |
 | vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:generation_tokens_total | Total number of generated tokens |
 | vllm:max_num_generation_tokens | Maximum number of requested generation tokens. Currently same as `vllm:request_generation_tokens` since always only one choice is returned |
 | vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
 | vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:prompt_tokens_total | Total number of prompt tokens processed |
 | vllm:request_success_total | Count of successfully processed requests |
 
 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.
diff --git a/pkg/common/config.go b/pkg/common/config.go
index 64c8fe3..38c5efa 100644
--- a/pkg/common/config.go
+++ b/pkg/common/config.go
@@ -262,7 +262,13 @@ type Metrics struct {
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf
 	TPOTBucketValues []int `yaml:"tpot-buckets-values" json:"tpot-buckets-values"`
 	// RequestPromptTokens RequestGenerationTokens RequestParamsMaxTokens Histogram fake-observation arrays for init.
-	// Each value will be passed to Observe() once at start-up.
+	// Each value in these arrays is passed to Observe() exactly once at startup.
+	// By default:
+	// - vllm:prompt_tokens_total is initialized with a total estimated from RequestPromptTokens.
+	// - vllm:generation_tokens_total is initialized with a total estimated from RequestGenerationTokens.
+	//
+	// If TotalPromptTokens or TotalGenerationTokens are explicitly provided,
+	// they override these estimates and are used directly as the initial total token counts.
 	RequestPromptTokens     []int `yaml:"request-prompt-tokens" json:"request-prompt-tokens"`         // prompt-length samples
 	RequestGenerationTokens []int `yaml:"request-generation-tokens" json:"request-generation-tokens"` // generation-length samples
 	RequestParamsMaxTokens  []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
@@ -270,6 +276,11 @@ type Metrics struct {
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
+	// TotalPromptTokens is the total number of prompt tokens processed
+	TotalPromptTokens *int64 `json:"total-prompt-tokens,omitempty"`
+	// TotalGenerationTokens is the total number of generated tokens
+	TotalGenerationTokens *int64 `json:"total-generation-tokens,omitempty"`
+
 	// Latency histograms - have same buckets upper boundaries in seconds are:
 	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
 	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
index cc20bcd..1c6e560 100644
--- a/pkg/llm-d-inference-sim/metrics.go
+++ b/pkg/llm-d-inference-sim/metrics.go
@@ -45,6 +45,8 @@ const (
 	generationTokensMetricName = "vllm:request_generation_tokens"
 	paramMaxTokensMetricName   = "vllm:request_params_max_tokens"
 	promptTokensMetricName     = "vllm:request_prompt_tokens"
+	generationTokensTotalMetricName = "vllm:generation_tokens_total"
+	promptTokensTotalMetricName     = "vllm:prompt_tokens_total"
 	successTotalMetricName     = "vllm:request_success_total"
 	loraRequestsMetricName     = "vllm:lora_requests_info"
 	reqRunningMetricName       = "vllm:num_requests_running"
@@ -292,6 +294,34 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}
 
+	s.metrics.promptTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      promptTokensTotalMetricName,
+			Help:      "Total number of prompt tokens processed.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.promptTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus prompt_tokens_total counter register failed")
+		return err
+	}
+
+	s.metrics.generationTokensTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      generationTokensTotalMetricName,
+			Help:      "Total number of generated tokens.",
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+
+	if err := s.metrics.registry.Register(s.metrics.generationTokensTotal); err != nil {
+		s.logger.Error(err, "prometheus generation_tokens_total counter register failed")
+		return err
+	}
+
 	s.metrics.requestSuccessTotal = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
 			Subsystem: "",
@@ -343,9 +373,23 @@ func (s *VllmSimulator) setInitialPrometheusMetrics(cacheConfig *prometheus.Gaug
 	buckets := build125Buckets(s.config.MaxModelLen)
 	if s.config.FakeMetrics.RequestPromptTokens != nil {
 		s.initFakeHistogram(s.metrics.requestPromptTokens, buckets, s.config.FakeMetrics.RequestPromptTokens)
+		var promptTotal int64
+		if s.config.FakeMetrics.TotalPromptTokens != nil {
+			promptTotal = *s.config.FakeMetrics.TotalPromptTokens
+		} else {
+			promptTotal = estimateTokenTotal(s.config.FakeMetrics.RequestPromptTokens, buckets)
+		}
+		s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTotal))
 	}
 	if s.config.FakeMetrics.RequestGenerationTokens != nil {
 		s.initFakeHistogram(s.metrics.requestParamsMaxTokens, buckets, s.config.FakeMetrics.RequestGenerationTokens)
+		var genTotal int64
+		if s.config.FakeMetrics.TotalGenerationTokens != nil {
+			genTotal = *s.config.FakeMetrics.TotalGenerationTokens
+		} else {
+			genTotal = estimateTokenTotal(s.config.FakeMetrics.RequestGenerationTokens, buckets)
+		}
+		s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(genTotal))
 	}
 	if s.config.FakeMetrics.RequestParamsMaxTokens != nil {
 		s.initFakeHistogram(s.metrics.requestGenerationTokens, buckets, s.config.FakeMetrics.RequestParamsMaxTokens)
@@ -727,6 +771,8 @@ func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.metrics.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
 	s.metrics.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	s.metrics.promptTokensTotal.WithLabelValues(modelName).Add(float64(promptTokens))
+	s.metrics.generationTokensTotal.WithLabelValues(modelName).Add(float64(generationTokens))
 	if maxTokens != nil {
 		s.metrics.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
 	}
@@ -764,3 +810,50 @@ func build125Buckets(maxValue int) []float64 {
 	}
 	return buckets
 }
+
+// estimateTokenTotal estimates the total number of tokens based on histogram bucket boundaries
+// and the number of requests in each bucket. It assumes that requests in a bucket have token
+// lengths uniformly distributed between the bucket's lower and upper bounds, and uses the
+// midpoint as a representative value for estimation.
+//
+// The last bucket is treated as (buckets[len(buckets)-1], +Inf), so its upper bound is approximated
+// as twice the lower bound for midpoint calculation.
+func estimateTokenTotal(counts []int, buckets []float64) int64 {
+	if len(counts) == 0 || len(buckets) == 0 {
+		return 0
+	}
+
+	nCounts := len(counts)
+	nBuckets := len(buckets)
+
+	var total int64
+	lower := 0.0
+
+	for i := 0; i < nCounts; i++ {
+		count := counts[i]
+		if count == 0 {
+			// Advance lower bound even if count is zero, to stay aligned with buckets
+			if i < nBuckets {
+				lower = buckets[i]
+			}
+			continue
+		}
+
+		var upper float64
+		if i < nBuckets {
+			// Bucket i corresponds to (lower, buckets[i]]
+			upper = buckets[i]
+		} else {
+			// Last bucket: (buckets[nBuckets-1], +Inf) → approximate upper = 2 * lower
+			upper = lower * 2.0
+		}
+
+		mid := (lower + upper) / 2.0
+		total += int64(float64(count) * mid)
+
+		// Update lower for next iteration
+		lower = upper
+	}
+
+	return total
+}
diff --git a/pkg/llm-d-inference-sim/metrics_test.go b/pkg/llm-d-inference-sim/metrics_test.go
index 1bbb307..5c13170 100644
--- a/pkg/llm-d-inference-sim/metrics_test.go
+++ b/pkg/llm-d-inference-sim/metrics_test.go
@@ -162,6 +162,7 @@ var _ = Describe("Simulator metrics", Ordered, func() {
 		}
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), 1)))
 		Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), 1)))
+		Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 25`))
 
 		// request_generation_tokens
 		// We do not verify the distribution of the number of tokens generated per request,
@@ -710,12 +711,46 @@
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, generationTokensMetricName, math.Inf(1), expectedCount)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, promptTokensMetricName, math.Inf(1), expectedCount)))
 			Expect(metrics).To(ContainSubstring(getFloatBucketMetricLine(testModel, paramMaxTokensMetricName, math.Inf(1), expectedCount)))
+			Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 140`))
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 140`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="length",model_name="testmodel"} 0`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="remote_decode",model_name="testmodel"} 0`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="stop",model_name="testmodel"} 20`))
 			Expect(metrics).To(ContainSubstring(`vllm:request_success_total{finish_reason="tool_calls",model_name="testmodel"} 0`))
 		})
 
+		It("Should use TotalPromptTokens and TotalGenerationTokens if provided", func() {
+			ctx := context.TODO()
+			args := []string{
+				"cmd", "--model", testModel, "--mode", common.ModeRandom,
+				"--fake-metrics",
+				`{` +
+					`"running-requests":5,` +
+					`"waiting-requests":2,` +
+					`"kv-cache-usage":0.1,` +
+					`"request-prompt-tokens":[100,200],` +
+					`"request-generation-tokens":[50,150],` +
+					`"total-prompt-tokens":12345,` + // explicit total
+					`"total-generation-tokens":67890,` + // explicit total
+					`"request-success-total":{"stop":10}` +
+					`}`,
+			}
+
+			client, err := startServerWithArgs(ctx, args)
+			Expect(err).NotTo(HaveOccurred())
+
+			resp, err := client.Get(metricsUrl)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(resp.StatusCode).To(Equal(http.StatusOK))
+
+			data, err := io.ReadAll(resp.Body)
+			Expect(err).NotTo(HaveOccurred())
+			metrics := string(data)
+
+			// Verify that the explicit totals are used
+			Expect(metrics).To(MatchRegexp(`vllm:prompt_tokens_total{model_name="testmodel"} 12345`))
+			Expect(metrics).To(MatchRegexp(`vllm:generation_tokens_total{model_name="testmodel"} 67890`))
+		})
 	})
 
 	Context("fake ttft metrics", func() {
@@ -946,3 +981,110 @@ var _ = Describe("build125Buckets", Ordered, func() {
 		}
 	})
 })
+
+var _ = Describe("estimateTokenTotal", func() {
+	It("should correctly estimate total tokens from bucket counts and boundaries", func() {
+		tests := []struct {
+			name     string
+			counts   []int
+			buckets  []float64
+			expected int64
+		}{
+			{
+				name:     "empty counts",
+				counts:   []int{},
+				buckets:  []float64{1, 2, 5},
+				expected: 0,
+			},
+			{
+				name:     "empty buckets",
+				counts:   []int{10, 20},
+				buckets:  []float64{},
+				expected: 0,
+			},
+			{
+				name:     "only first bucket has requests: [0,10]",
+				counts:   []int{1},
+				buckets:  []float64{10},
+				expected: 5,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// total = 5
+			},
+			{
+				name:     "first two buckets: [0,10], (10,20]",
+				counts:   []int{2, 3},
+				buckets:  []float64{10, 20},
+				expected: 55,
+				// bucket0: [0,10] → mid=5 → 2*5 = 10
+				// bucket1: (10,20] → mid=15 → 3*15 = 45
+				// total = 10 + 45 = 55
+			},
+			{
+				name:     "three finite buckets + last (+Inf) bucket",
+				counts:   []int{1, 1, 1, 1},
+				buckets:  []float64{10, 20, 50},
+				expected: 130,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// bucket1: (10,20] → mid=15 → 1*15 = 15
+				// bucket2: (20,50] → mid=35 → 1*35 = 35
+				// bucket3: (50,+Inf) → upper=100, mid=75 → 1*75 = 75
+				// total = 5 + 15 + 35 + 75 = 130
+			},
+			{
+				name:     "zero counts in some buckets",
+				counts:   []int{0, 5, 0, 2},
+				buckets:  []float64{1, 10, 100},
+				expected: 327,
+				// bucket1: (1,10] → mid=5.5 → 5*5.5 = 27.5 → truncated to 27
+				// bucket3: (100,+Inf) → upper=200, mid=150 → 2*150 = 300
+				// total = 27 + 300 = 327
+			},
+			{
+				name:     "only last bucket has requests",
+				counts:   []int{0, 0, 0, 4},
+				buckets:  []float64{10, 100, 1000},
+				expected: 6000,
+				// bucket3: (1000,+Inf) → upper=2000, mid=1500 → 4*1500 = 6000
+				// total = 4*1500 = 6000
+			},
+			{
+				name:     "non-integer midpoints truncated by int64 cast",
+				counts:   []int{1},
+				buckets:  []float64{1},
+				expected: 0,
+				// bucket0: [0,1] → mid=0.5 → 1*0.5 = 0.5 → truncated to 0
+			},
+			{
+				name:     "collaborator example: [10,20,30] with long buckets",
+				counts:   []int{10, 20, 30},
+				buckets:  []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+				expected: 140,
+				// bucket0: [0,1] → mid=0.5 → 10*0.5 = 5
+				// bucket1: (1,2] → mid=1.5 → 20*1.5 = 30
+				// bucket2: (2,5] → mid=3.5 → 30*3.5 = 105
+				// total = 5 + 30 + 105 = 140
+			},
+			{
+				name:     "counts shorter than buckets (trailing zeros omitted)",
+				counts:   []int{1, 1},
+				buckets:  []float64{10, 100, 1000, 10000},
+				expected: 60,
+				// bucket0: [0,10] → mid=5 → 1*5 = 5
+				// bucket1: (10,100] → mid=55 → 1*55 = 55
+				// total = 5 + 55 = 60
+			},
+			{
+				name:     "all zero counts",
+				counts:   []int{0, 0, 0},
+				buckets:  []float64{1, 10, 100},
+				expected: 0,
+				// all buckets have zero requests
+			},
+		}
+
+		for _, test := range tests {
+			result := estimateTokenTotal(test.counts, test.buckets)
+			Expect(result).To(Equal(test.expected), "test case: %s", test.name)
+		}
+	})
+})
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
index f097853..38dae09 100644
--- a/pkg/llm-d-inference-sim/simulator.go
+++ b/pkg/llm-d-inference-sim/simulator.go
@@ -136,6 +136,10 @@ type metricsData struct {
 	requestPromptTokens *prometheus.HistogramVec
 	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
 	requestGenerationTokens *prometheus.HistogramVec
+	// promptTokensTotal is prometheus counter for total number of input (prompt) tokens
+	promptTokensTotal *prometheus.CounterVec
+	// generationTokensTotal is prometheus counter for total number of generated tokens
+	generationTokensTotal *prometheus.CounterVec
 	// maxNumGenerationTokens is prometheus histogram for maximum number of generated tokens in request
 	maxNumGenerationTokens *prometheus.HistogramVec
 	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request
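
Note (not part of the patch): the sketch below is a standalone, slightly simplified re-implementation of the midpoint estimation that estimateTokenTotal falls back to when total-prompt-tokens / total-generation-tokens are not supplied. It is included only to illustrate the arithmetic; the package layout, function name "estimate", and main function are illustrative and not taken from the repository. It reproduces the "collaborator example" from the new unit test, where counts [10, 20, 30] against the long bucket list yield 140.

package main

import "fmt"

// estimate mirrors the midpoint logic of estimateTokenTotal in metrics.go:
// counts[i] requests are assumed to fall into bucket i, each contributing the
// bucket midpoint; the final (+Inf) bucket approximates its upper bound as
// twice its lower bound. Zero-count buckets still advance the lower bound.
func estimate(counts []int, buckets []float64) int64 {
	if len(counts) == 0 || len(buckets) == 0 {
		return 0
	}
	var total int64
	lower := 0.0
	for i, count := range counts {
		var upper float64
		if i < len(buckets) {
			upper = buckets[i] // finite bucket: (lower, buckets[i]]
		} else {
			upper = lower * 2.0 // +Inf bucket: approximate upper bound
		}
		if count > 0 {
			mid := (lower + upper) / 2.0
			total += int64(float64(count) * mid)
		}
		lower = upper
	}
	return total
}

func main() {
	// The "collaborator example" from the new unit test:
	// [0,1] → 10*0.5 = 5, (1,2] → 20*1.5 = 30, (2,5] → 30*3.5 = 105
	counts := []int{10, 20, 30}
	buckets := []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}
	fmt.Println(estimate(counts, buckets)) // prints 140
}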