feat(benchmark): 优化基准测试流程并添加分析工作流

h7ml · h7ml · commit 2a430955b393 · 2025-03-25T01:47:36.000+08:00
- 新增 GitHub Actions 工作流，实现自动化分析和报告生成
- 重构基准测试代码，使用多线程提高执行效率
- 优化报告输出格式，增加总运行时间和批次信息
- 更新依赖库版本，提高代码兼容性和性能
diff --git a/.github/workflows/analyze.yml b/.github/workflows/analyze.yml
@@ -0,0 +1,114 @@
+name: Analyze
+
+on:
+  push:
+    branches:
+      - feat/benchmark
+  workflow_dispatch:
+
+permissions:
+  contents: write
+  pull-requests: write
+  actions: write
+
+jobs:
+  analyze:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: feat/benchmark
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Setup PNPM # must run before setup-node so `cache: pnpm` can find pnpm
+        uses: pnpm/action-setup@v4
+        with:
+          version: 9
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+          cache: 'pnpm' # the repository lockfile is pnpm-lock.yaml, not an npm lockfile
+
+      - name: Cache dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            **/node_modules
+            ~/.pnpm-store
+          key: ${{ runner.os }}-pnpm-${{ hashFiles('**/pnpm-lock.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-pnpm-
+
+      - name: Install dependencies
+        run: pnpm install --frozen-lockfile
+        continue-on-error: false
+
+      # Writes the benchmark credentials into .env.local for the later steps.
+      # Secrets are passed through `env` instead of being interpolated into the
+      # script text, so quotes or `$` inside a value cannot alter the shell script.
+      - name: Set environment variables
+        env:
+          OPENAI_BENCHMARK_BASE_URL: ${{ secrets.OPENAI_BENCHMARK_BASE_URL }}
+          OPENAI_BENCHMARK_API_KEY: ${{ secrets.OPENAI_BENCHMARK_API_KEY }}
+        run: |
+          if [ -z "$OPENAI_BENCHMARK_API_KEY" ]; then
+            echo "Error: OPENAI_BENCHMARK_API_KEY secret is not set" >&2
+            exit 1
+          fi
+          echo "OPENAI_BASE_URL=$OPENAI_BENCHMARK_BASE_URL" >> .env.local
+          echo "OPENAI_API_KEY=$OPENAI_BENCHMARK_API_KEY" >> .env.local
+
+      - name: Build
+        run: |
+          pnpm build
+        continue-on-error: false
+
+      - name: Run benchmark
+        id: benchmark
+        run: pnpm benchmark
+        continue-on-error: true
+
+      - name: Run analysis
+        if: steps.benchmark.outcome == 'success'
+        run: pnpm analyze
+        continue-on-error: true
+
+      - name: Upload report
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-report-${{ github.sha }}
+          path: benchmark
+          retention-days: 7
+
+      - name: Configure Git
+        run: |
+          git config --local user.email "action@h7ml.cn"
+          git config --local user.name "GitHub Action"
+          git config --local core.autocrlf false
+          git config --local core.safecrlf false
+          git config advice.ignoredHook false
+
+      # Commits whatever the benchmark/analysis steps changed and pushes it back
+      # to feat/benchmark.
+      - name: Commit and Push Changes
+        run: |
+          # Stage everything except the credentials file written earlier in this job.
+          git add -A -- . ':(exclude).env.local'
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+            exit 0
+          fi
+          git commit -m "feat: update benchmark results [skip ci]"
+          # Plain push first: a forced push could discard commits that reached
+          # the branch while the benchmark was running.
+          git push origin feat/benchmark || {
+            echo "Push failed, retrying after pull..."
+            git pull --rebase --autostash origin feat/benchmark
+            git push origin feat/benchmark
+          }
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Cleanup
+        if: always()
+        run: |
+          rm -rf node_modules
+          rm -rf .env.local
+
+
diff --git a/benchmark/evaluate.js b/benchmark/evaluate.js
@@ -1,7 +1,7 @@
 /* global console */
 import { ChatOpenAI } from '@langchain/openai';
 import { StructuredOutputParser } from 'langchain/output_parsers';
-import { ChatPromptTemplate } from '@langchain/core/prompts';
+import { PromptTemplate } from '@langchain/core/prompts';
 import { config } from 'dotenv';
 import { z } from 'zod';
 
@@ -13,8 +13,8 @@ config({ path: '.env.local' });
  * @param {string} modelName - 模型名称
  * @returns {ChatOpenAI} - LangChain Chat模型实例
  */
-export function getEvaluatorForModel(modelName) {
-  let model = new ChatOpenAI({
+function getEvaluatorForModel(modelName) {
+  return new ChatOpenAI({
     model: modelName,
     openAIApiKey: process.env.OPENAI_API_KEY,
     temperature: 0,
@@ -23,8 +23,6 @@ export function getEvaluatorForModel(modelName) {
       baseUrl: process.env.OPENAI_BASE_URL,
     },
   });
-
-  return model;
 }
 
 /**
@@ -83,10 +81,7 @@ export async function evaluate(
   );
 
   // 构建评估提示
-  const formatInstructions = parser.getFormatInstructions();
-
-  const prompt = ChatPromptTemplate.fromTemplate(
-    `你是一位精通多种语言的专业翻译评估专家，需要对机器翻译的质量进行评估。
+  const prompt = PromptTemplate.fromTemplate(`你是一位精通多种语言的专业翻译评估专家，需要对机器翻译的质量进行评估。
 
 原始文本 (Markdown格式):
 \`\`\`
@@ -110,20 +105,20 @@ export async function evaluate(
 
 最后，提供具体的问题列表和改进建议。
 
-${formatInstructions}
+{formatInstructions}
 `);
 
   try {
-    const response = await evaluator.invoke([
-      await prompt.formatMessages({
-        originalData,
-        translatedData,
-        targetLanguage,
-      }),
-    ]);
-
-    const result = await parser.parse(response.content);
-    return result;
+    const result = await prompt.pipe(evaluator).invoke({
+      originalData,
+      translatedData,
+      targetLanguage,
+      formatInstructions: parser.getFormatInstructions(),
+    });
+
+    const parsed = await parser.parse(result.content);
+    console.log(parsed);
+    return parsed;
   } catch (error) {
     console.error('评估过程出错:', error);
     // 返回默认评估结果
diff --git a/benchmark/index.js b/benchmark/index.js
@@ -1,50 +1,104 @@
-/* global console, setTimeout */
+/* global console */
 
-import { runBenchmark } from './benchmark.js';
+import { Worker } from 'worker_threads';
 import { testModels, testCases } from './testsuit.js';
 import { analyzeReports } from './analyze.js';
 import path from 'path';
 import fs from 'fs';
 import { fileURLToPath } from 'url';
 
+// 获取当前文件的目录路径
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
 // 获取 report 目录的路径
-const reportDir = path.join(path.dirname(fileURLToPath(import.meta.url)), 'report');
+const reportDir = path.join(__dirname, 'report');
 
 // 确保 report 目录存在
 if (!fs.existsSync(reportDir)) {
   fs.mkdirSync(reportDir, { recursive: true });
 }
 
-async function runBenchmarkWithInterval(iterations = 10, intervalMs = 1000) {
-  for (let i = 0; i < iterations; i++) {
-    for (const model of testModels) {
-      for (const testCase of testCases) {
-        console.log(`执行基准测试 [${i + 1}/${iterations}]: ${model.name} / ${testCase.name}`);
+// 创建worker实例
+function createWorker(model, iteration, intervalMs) {
+  return new Promise((resolve, reject) => {
+    const workerPath = path.join(__dirname, 'worker.js');
+    const worker = new Worker(workerPath);
 
-        try {
-          const result = await runBenchmark(model, testCase);
-          reportOutput(result);
+    worker.on('message', (message) => {
+      switch (message.type) {
+        case 'progress':
+          console.log(message.message);
+          break;
+        case 'result':
+          reportOutput(message.result);
           console.log(
-            `完成! 耗时: ${result.timeCost.toFixed(2)}s, 成本: $${(
-              result.inputTokensCost + result.outputTokensCost
-            ).toFixed(5)}`,
+            `完成! ${message.model} / ${message.testCase} (耗时: ${message.timeCost.toFixed(2)}s, 成本: $${message.cost.toFixed(5)})`
           );
+          break;
+        case 'complete':
+          console.log(`模型 ${message.model} 的所有测试完成`);
+          resolve();
+          worker.terminate();
+          break;
+        case 'error':
+          console.error(`基准测试失败 (${message.model}): ${message.error}`);
+          resolve();
+          worker.terminate();
+          break;
+      }
+    });
+
+    worker.on('error', reject);
+    worker.on('exit', (code) => {
+      if (code !== 0) {
+        reject(new Error(`Worker stopped with exit code ${code}`));
+      }
+    });
+
+    // 将测试用例数据也传递给worker
+    worker.postMessage({
+      model,
+      testCases,
+      iteration,
+      intervalMs
+    });
+  });
+}
+
+async function runBenchmarkWithInterval(iterations = 10, intervalMs = 1000) {
+  const startTime = new Date();
+  console.log(`开始基准测试: ${startTime.toLocaleString()}`);
+  console.log(`加载了${testModels.length}个模型`);
+
+  // 设置并发执行的worker数量
+  const maxWorkers = 3;
+
+  for (let i = 0; i < iterations; i++) {
+    // 将模型列表分成多个批次
+    for (let j = 0; j < testModels.length; j += maxWorkers) {
+      const modelBatch = testModels.slice(j, Math.min(j + maxWorkers, testModels.length));
+      console.log(`\n执行第 ${i + 1}/${iterations} 轮，批次 ${Math.floor(j / maxWorkers) + 1}/${Math.ceil(testModels.length / maxWorkers)}`);
+
+      // 创建并发worker
+      const workerPromises = modelBatch.map(model =>
+        createWorker(model, i + 1, intervalMs)
+      );
+
+      // 等待当前批次完成
+      await Promise.all(workerPromises);
 
-          // 添加间隔，避免API限流
-          if (
-            i < iterations - 1 ||
-            model !== testModels[testModels.length - 1] ||
-            testCase !== testCases[testCases.length - 1]
-          ) {
-            await new Promise((resolve) => setTimeout(resolve, intervalMs));
-          }
-        } catch (error) {
-          console.error(`基准测试失败: ${error.message}`);
-        }
+      // 批次之间添加间隔，避免API限流
+      if (j + maxWorkers < testModels.length || i < iterations - 1) {
+        await new Promise((resolve) => setTimeout(resolve, intervalMs));
       }
     }
   }
 
+  const endTime = new Date();
+  const totalTimeInSeconds = (endTime - startTime) / 1000;
+  console.log(`\n基准测试结束: ${endTime.toLocaleString()}`);
+  console.log(`总运行时间: ${totalTimeInSeconds.toFixed(2)} 秒`);
   console.log('所有基准测试完成！');
 
   // 分析报告
@@ -57,14 +111,13 @@ async function runBenchmarkWithInterval(iterations = 10, intervalMs = 1000) {
 
 function reportOutput(result) {
   // 生成文件名: 模型名_测试用例名_目标语言_日期.json
-  const fileName = `${result.modelName}_${result.testCaseName}_${result.targetLanguage || 'unknown'
-    }_${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
+  const fileName = `${result.modelName}_${result.testCaseName}_${result.targetLanguage || 'unknown'}_${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
   const filePath = path.join(reportDir, fileName);
 
   // 将结果保存为JSON文件
   fs.writeFileSync(filePath, JSON.stringify(result, null, 2));
 }
 
 // 运行基准测试
-// 每个模型运行1次，每次间隔1秒，确保总消耗token不超过0.2美元
-runBenchmarkWithInterval(1, 1000);
+// 每个模型运行2次，每次间隔1秒，确保总消耗token不超过0.2美元
+runBenchmarkWithInterval(2, 1000);
diff --git a/benchmark/worker.js b/benchmark/worker.js
@@ -0,0 +1,51 @@
+import { parentPort } from 'worker_threads';
+import { runBenchmark } from './benchmark.js';
+import { fileURLToPath } from 'url';
+import path from 'path';
+
+// 设置 __filename 和 __dirname
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+// 监听来自主线程的消息
+parentPort.on('message', async ({ model, testCases, iteration, intervalMs }) => {
+  try {
+    for (const testCase of testCases) {
+      // 发送进度信息到主线程
+      parentPort.postMessage({
+        type: 'progress',
+        message: `执行基准测试 [${iteration}]: ${model.name} / ${testCase.name}`
+      });
+
+      const result = await runBenchmark(model, testCase);
+
+      // 发送结果到主线程
+      parentPort.postMessage({
+        type: 'result',
+        result,
+        model: model.name,
+        testCase: testCase.name,
+        timeCost: result.timeCost,
+        cost: result.inputTokensCost + result.outputTokensCost
+      });
+
+      // 测试用例之间添加间隔
+      if (testCase !== testCases[testCases.length - 1]) {
+        await new Promise((resolve) => setTimeout(resolve, intervalMs));
+      }
+    }
+
+    // 通知主线程该模型的所有测试已完成
+    parentPort.postMessage({
+      type: 'complete',
+      model: model.name
+    });
+  } catch (error) {
+    // 发送错误信息到主线程
+    parentPort.postMessage({
+      type: 'error',
+      error: error.message,
+      model: model.name
+    });
+  }
+});