Skip to content

Commit 2a43095

Browse files
committed
feat(benchmark): 优化基准测试流程并添加分析工作流
- 新增 GitHub Actions 工作流,实现自动化分析和报告生成 - 重构基准测试代码,使用多线程提高执行效率 - 优化报告输出格式,增加总运行时间和批次信息 - 更新依赖库版本,提高代码兼容性和性能
1 parent 5cf7d80 commit 2a43095

File tree

4 files changed

+262
-49
lines changed

4 files changed

+262
-49
lines changed

.github/workflows/analyze.yml

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Runs the benchmark and its analysis on feat/benchmark, uploads the
# `benchmark` directory as an artifact, and commits any generated files back
# to the branch.
name: Analyze

on:
  push:
    branches:
      - feat/benchmark
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write
  actions: write

jobs:
  analyze:
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: feat/benchmark
          token: ${{ secrets.GITHUB_TOKEN }}

      # pnpm has to be installed before setup-node so that `cache: pnpm` can
      # resolve the pnpm store location.
      - name: Setup PNPM
        uses: pnpm/action-setup@v4
        with:
          version: 9

      # The project is locked with pnpm-lock.yaml; `cache: npm` would look for
      # an npm/yarn lock file instead.
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 20
          cache: 'pnpm'

      - name: Cache dependencies
        uses: actions/cache@v4
        with:
          path: |
            **/node_modules
            ~/.pnpm-store
          key: ${{ runner.os }}-pnpm-${{ hashFiles('**/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-pnpm-

      - name: Install dependencies
        run: pnpm install --frozen-lockfile
        continue-on-error: false

      # Secrets are handed to the script through the environment rather than
      # being expanded into the script text itself.
      - name: Set environment variables
        env:
          OPENAI_BENCHMARK_BASE_URL: ${{ secrets.OPENAI_BENCHMARK_BASE_URL }}
          OPENAI_BENCHMARK_API_KEY: ${{ secrets.OPENAI_BENCHMARK_API_KEY }}
        run: |
          if [ -z "$OPENAI_BENCHMARK_API_KEY" ]; then
            echo "Error: OPENAI_BENCHMARK_API_KEY is not set"
            exit 1
          fi
          {
            echo "OPENAI_BASE_URL=$OPENAI_BENCHMARK_BASE_URL"
            echo "OPENAI_API_KEY=$OPENAI_BENCHMARK_API_KEY"
          } >> .env.local

      - name: Build
        run: |
          pnpm build
        continue-on-error: false

      # A failed benchmark must not stop the report upload below.
      - name: Run benchmark
        id: benchmark
        run: pnpm benchmark
        continue-on-error: true

      - name: Run analysis
        if: steps.benchmark.outcome == 'success'
        run: pnpm analyze
        continue-on-error: true

      - name: Upload report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-report-${{ github.sha }}
          path: benchmark
          retention-days: 7

      # NOTE(review): the e-mail address below appears redacted in this view;
      # confirm the intended committer address.
      - name: Configure Git
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "GitHub Action"
          git config --local core.autocrlf false
          git config --local core.safecrlf false
          git config advice.ignoredHook false

      - name: Commit and Push Changes
        run: |
          # Drop the secrets file first so `git add .` can never stage it.
          rm -f .env.local
          if [[ -n $(git status --porcelain) ]]; then
            git add .
            git commit -m "feat: update benchmark results [skip ci]"
            # Plain push: a forced push would discard commits that landed on
            # the branch while this job was running.
            git push origin feat/benchmark || {
              echo "Push failed, retrying after pull..."
              git pull --rebase origin feat/benchmark
              git push origin feat/benchmark
            }
          else
            echo "No changes to commit"
          fi
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Cleanup
        if: always()
        run: |
          rm -rf node_modules
          rm -rf .env.local
113+
114+

benchmark/evaluate.js

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* global console */
22
import { ChatOpenAI } from '@langchain/openai';
33
import { StructuredOutputParser } from 'langchain/output_parsers';
4-
import { ChatPromptTemplate } from '@langchain/core/prompts';
4+
import { PromptTemplate } from '@langchain/core/prompts';
55
import { config } from 'dotenv';
66
import { z } from 'zod';
77

@@ -13,8 +13,8 @@ config({ path: '.env.local' });
1313
* @param {string} modelName - 模型名称
1414
* @returns {ChatOpenAI} - LangChain Chat模型实例
1515
*/
16-
export function getEvaluatorForModel(modelName) {
17-
let model = new ChatOpenAI({
16+
function getEvaluatorForModel(modelName) {
17+
return new ChatOpenAI({
1818
model: modelName,
1919
openAIApiKey: process.env.OPENAI_API_KEY,
2020
temperature: 0,
@@ -23,8 +23,6 @@ export function getEvaluatorForModel(modelName) {
2323
baseUrl: process.env.OPENAI_BASE_URL,
2424
},
2525
});
26-
27-
return model;
2826
}
2927

3028
/**
@@ -83,10 +81,7 @@ export async function evaluate(
8381
);
8482

8583
// 构建评估提示
86-
const formatInstructions = parser.getFormatInstructions();
87-
88-
const prompt = ChatPromptTemplate.fromTemplate(
89-
`你是一位精通多种语言的专业翻译评估专家,需要对机器翻译的质量进行评估。
84+
const prompt = PromptTemplate.fromTemplate(`你是一位精通多种语言的专业翻译评估专家,需要对机器翻译的质量进行评估。
9085
9186
原始文本 (Markdown格式):
9287
\`\`\`
@@ -110,20 +105,20 @@ export async function evaluate(
110105
111106
最后,提供具体的问题列表和改进建议。
112107
113-
${formatInstructions}
108+
{formatInstructions}
114109
`);
115110

116111
try {
117-
const response = await evaluator.invoke([
118-
await prompt.formatMessages({
119-
originalData,
120-
translatedData,
121-
targetLanguage,
122-
}),
123-
]);
124-
125-
const result = await parser.parse(response.content);
126-
return result;
112+
const result = await prompt.pipe(evaluator).invoke({
113+
originalData,
114+
translatedData,
115+
targetLanguage,
116+
formatInstructions: parser.getFormatInstructions(),
117+
});
118+
119+
const parsed = await parser.parse(result.content);
120+
console.log(parsed);
121+
return parsed;
127122
} catch (error) {
128123
console.error('评估过程出错:', error);
129124
// 返回默认评估结果

benchmark/index.js

Lines changed: 82 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,104 @@
1-
/* global console, setTimeout */
1+
/* global console, setTimeout */
22

3-
import { runBenchmark } from './benchmark.js';
3+
import { Worker } from 'worker_threads';
44
import { testModels, testCases } from './testsuit.js';
55
import { analyzeReports } from './analyze.js';
66
import path from 'path';
77
import fs from 'fs';
88
import { fileURLToPath } from 'url';
99

10+
// Resolve the directory that contains this module (ESM has no built-in __dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Benchmark reports are written to the `report` folder next to this file.
const reportDir = path.join(__dirname, 'report');

// Create the report folder on first run.
const reportDirExists = fs.existsSync(reportDir);
if (!reportDirExists) {
  fs.mkdirSync(reportDir, { recursive: true });
}
1721

18-
async function runBenchmarkWithInterval(iterations = 10, intervalMs = 1000) {
19-
for (let i = 0; i < iterations; i++) {
20-
for (const model of testModels) {
21-
for (const testCase of testCases) {
22-
console.log(`执行基准测试 [${i + 1}/${iterations}]: ${model.name} / ${testCase.name}`);
22+
/**
 * Runs every test case for one model inside a dedicated worker thread.
 *
 * The worker script (`worker.js`, resolved next to this file) is sent the
 * model, the full test-case list, the iteration number and the interval.
 * It reports back through typed messages:
 *   - `progress`: a status line that is logged as-is;
 *   - `result`:   one benchmark result, persisted with `reportOutput`;
 *   - `complete`: all test cases finished, so the promise resolves;
 *   - `error`:    the benchmark failed; it is logged and the promise still
 *                 resolves so that a batch awaiting several workers goes on.
 *
 * @param {object} model - Model descriptor forwarded to the worker.
 * @param {number} iteration - Round number forwarded to the worker.
 * @param {number} intervalMs - Interval in milliseconds forwarded to the worker.
 * @returns {Promise<void>} Resolves after a `complete` or `error` message.
 *   Rejects when the worker emits `error`, when a message cannot be
 *   processed, or when the worker exits before reporting an outcome.
 */
function createWorker(model, iteration, intervalMs) {
  return new Promise((resolve, reject) => {
    const workerPath = path.join(__dirname, 'worker.js');
    const worker = new Worker(workerPath);

    // Several events can race (message, error, exit); only the first decides.
    let settled = false;
    const settle = (finish, value) => {
      if (settled) {
        return false;
      }
      settled = true;
      finish(value);
      return true;
    };

    worker.on('message', (message) => {
      try {
        switch (message.type) {
          case 'progress':
            console.log(message.message);
            break;
          case 'result':
            reportOutput(message.result);
            console.log(
              `完成! ${message.model} / ${message.testCase} (耗时: ${message.timeCost.toFixed(2)}s, 成本: $${message.cost.toFixed(5)})`
            );
            break;
          case 'complete':
            console.log(`模型 ${message.model} 的所有测试完成`);
            settle(resolve);
            worker.terminate();
            break;
          case 'error':
            console.error(`基准测试失败 (${message.model}): ${message.error}`);
            settle(resolve);
            worker.terminate();
            break;
        }
      } catch (error) {
        // A malformed message or a failed report write must surface as a
        // rejection instead of an uncaught exception in the event handler.
        settle(reject, error);
        worker.terminate();
      }
    });

    worker.on('error', (error) => {
      settle(reject, error);
    });

    worker.on('exit', (code) => {
      // terminate() above also triggers `exit`; by then the promise is
      // already settled. An exit without any outcome (even with code 0)
      // would otherwise leave the promise pending forever.
      if (!settled) {
        settle(reject, new Error(`Worker stopped with exit code ${code}`));
      }
    });

    // Hand the test cases to the worker together with the model.
    worker.postMessage({
      model,
      testCases,
      iteration,
      intervalMs
    });
  });
}
68+
69+
async function runBenchmarkWithInterval(iterations = 10, intervalMs = 1000) {
70+
const startTime = new Date();
71+
console.log(`开始基准测试: ${startTime.toLocaleString()}`);
72+
console.log(`加载了${testModels.length}个模型`);
73+
74+
// 设置并发执行的worker数量
75+
const maxWorkers = 3;
76+
77+
for (let i = 0; i < iterations; i++) {
78+
// 将模型列表分成多个批次
79+
for (let j = 0; j < testModels.length; j += maxWorkers) {
80+
const modelBatch = testModels.slice(j, Math.min(j + maxWorkers, testModels.length));
81+
console.log(`\n执行第 ${i + 1}/${iterations} 轮,批次 ${Math.floor(j / maxWorkers) + 1}/${Math.ceil(testModels.length / maxWorkers)}`);
82+
83+
// 创建并发worker
84+
const workerPromises = modelBatch.map(model =>
85+
createWorker(model, i + 1, intervalMs)
86+
);
87+
88+
// 等待当前批次完成
89+
await Promise.all(workerPromises);
3290

33-
// 添加间隔,避免API限流
34-
if (
35-
i < iterations - 1 ||
36-
model !== testModels[testModels.length - 1] ||
37-
testCase !== testCases[testCases.length - 1]
38-
) {
39-
await new Promise((resolve) => setTimeout(resolve, intervalMs));
40-
}
41-
} catch (error) {
42-
console.error(`基准测试失败: ${error.message}`);
43-
}
91+
// 批次之间添加间隔,避免API限流
92+
if (j + maxWorkers < testModels.length || i < iterations - 1) {
93+
await new Promise((resolve) => setTimeout(resolve, intervalMs));
4494
}
4595
}
4696
}
4797

98+
const endTime = new Date();
99+
const totalTimeInSeconds = (endTime - startTime) / 1000;
100+
console.log(`\n基准测试结束: ${endTime.toLocaleString()}`);
101+
console.log(`总运行时间: ${totalTimeInSeconds.toFixed(2)} 秒`);
48102
console.log('所有基准测试完成!');
49103

50104
// 分析报告
@@ -57,14 +111,13 @@ async function runBenchmarkWithInterval(iterations = 10, intervalMs = 1000) {
57111

58112
/**
 * Writes one benchmark result to `reportDir` as a pretty-printed JSON file.
 *
 * The file name has the form
 * `<modelName>_<testCaseName>_<targetLanguage>_<ISO timestamp>.json`, where a
 * missing target language becomes `unknown` and the timestamp has `:` and `.`
 * replaced by `-`.
 *
 * @param {object} result - Benchmark result; `modelName`, `testCaseName` and
 *   `targetLanguage` are read for the file name, the whole object is stored.
 * @returns {void}
 */
function reportOutput(result) {
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const baseName = `${result.modelName}_${result.testCaseName}_${result.targetLanguage || 'unknown'}_${timestamp}`;

  // A name may contain path separators or other characters that are not
  // valid in file names (for example a `vendor/model` style identifier);
  // left untouched they would point into a directory that does not exist.
  const fileName = `${baseName.replace(/[\\/:*?"<>|]/g, '-')}.json`;
  const filePath = path.join(reportDir, fileName);

  // Persist the result as JSON.
  fs.writeFileSync(filePath, JSON.stringify(result, null, 2));
}
67120

68121
// 运行基准测试
69-
// 每个模型运行1次,每次间隔1秒,确保总消耗token不超过0.2美元
70-
runBenchmarkWithInterval(1, 1000);
122+
// Run the benchmark: 2 iterations per model with a 1000 ms interval.
// (Per the original author's note, this is meant to keep total token cost
// under USD 0.2.) The returned promise is handled explicitly so that a
// failed run is reported and reflected in the process exit code.
runBenchmarkWithInterval(2, 1000).catch((error) => {
  console.error('基准测试运行失败:', error);
  process.exitCode = 1;
});

benchmark/worker.js

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import { parentPort } from 'worker_threads';
2+
import { runBenchmark } from './benchmark.js';
3+
import { fileURLToPath } from 'url';
4+
import path from 'path';
5+
6+
// Derive __filename / __dirname for this ES module.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Resolves after `ms` milliseconds.
const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Handles one job from the main thread: runs `runBenchmark` for the given
 * model against each test case in order and reports back via `parentPort`.
 *
 * Messages posted: `progress` before each test case, `result` after each one,
 * `complete` once all test cases are done, or `error` as soon as anything
 * throws (the remaining test cases are then not run).
 */
async function handleJob({ model, testCases, iteration, intervalMs }) {
  try {
    for (const testCase of testCases) {
      // Tell the main thread which test case is starting.
      const progressText = `执行基准测试 [${iteration}]: ${model.name} / ${testCase.name}`;
      parentPort.postMessage({ type: 'progress', message: progressText });

      const result = await runBenchmark(model, testCase);

      // Ship the result back together with the summary figures.
      const resultMessage = {
        type: 'result',
        result,
        model: model.name,
        testCase: testCase.name,
        timeCost: result.timeCost,
        cost: result.inputTokensCost + result.outputTokensCost
      };
      parentPort.postMessage(resultMessage);

      // Wait between test cases, but not after the last one.
      const isLastCase = testCase === testCases[testCases.length - 1];
      if (!isLastCase) {
        await pause(intervalMs);
      }
    }

    // Every test case for this model has been run.
    parentPort.postMessage({ type: 'complete', model: model.name });
  } catch (error) {
    // Report the failure to the main thread.
    parentPort.postMessage({ type: 'error', error: error.message, model: model.name });
  }
}

// Listen for jobs sent by the main thread.
parentPort.on('message', handleJob);

0 commit comments

Comments
 (0)