Skip to content

Commit 6af63b9

Browse files
committed
Fixed HELM Capabilities case
1 parent 03cb72a commit 6af63b9

File tree

11 files changed

+40
-36
lines changed

11 files changed

+40
-36
lines changed
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1764176819.310097",
4-
"retrieved_timestamp": "1764176819.310097",
3+
"evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1764178796.5756102",
4+
"retrieved_timestamp": "1764178796.5756102",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 191.759
37+
"mean_eval_time": 191.759
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1764176819.310583",
4-
"retrieved_timestamp": "1764176819.310583",
3+
"evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1764178796.576065",
4+
"retrieved_timestamp": "1764178796.576065",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 103.939
37+
"mean_eval_time": 103.939
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1764176819.310952",
4-
"retrieved_timestamp": "1764176819.310952",
3+
"evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1764178796.576389",
4+
"retrieved_timestamp": "1764178796.576389",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 164.449
37+
"mean_eval_time": 164.449
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1764176819.311302",
4-
"retrieved_timestamp": "1764176819.311302",
3+
"evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1764178796.5766778",
4+
"retrieved_timestamp": "1764178796.5766778",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 449.115
37+
"mean_eval_time": 449.115
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1764176819.311538",
4-
"retrieved_timestamp": "1764176819.311538",
3+
"evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1764178796.576965",
4+
"retrieved_timestamp": "1764178796.576965",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 71.889
37+
"mean_eval_time": 71.889
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1764176819.309529",
4-
"retrieved_timestamp": "1764176819.309529",
3+
"evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1764178796.574767",
4+
"retrieved_timestamp": "1764178796.574767",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 118.552
37+
"mean_eval_time": 118.552
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1764176819.3127081",
4-
"retrieved_timestamp": "1764176819.3127081",
3+
"evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1764178796.578225",
4+
"retrieved_timestamp": "1764178796.578225",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 44.938
37+
"mean_eval_time": 44.938
3838
}
3939
},
4040
"generation_config": {}

data/helm_capabilities/openai/gpt-oss-120b/617b43a3-90bf-43aa-8fb1-64120d02fb4e.json renamed to data/helm_capabilities/openai/gpt-oss-120b/8d5fd3cf-23da-4718-a508-48734fa7d2b2.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1764176819.312159",
4-
"retrieved_timestamp": "1764176819.312159",
3+
"evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1764178796.5776498",
4+
"retrieved_timestamp": "1764178796.5776498",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 19.583
37+
"mean_eval_time": 19.583
3838
}
3939
},
4040
"generation_config": {}

data/helm_capabilities/openai/gpt-oss-20b/7ad3cab6-5b76-47fd-a793-633b4b7df2a8.json renamed to data/helm_capabilities/openai/gpt-oss-20b/fa4653b0-60c4-4de5-8715-b6767cb89111.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1764176819.312483",
4-
"retrieved_timestamp": "1764176819.312483",
3+
"evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1764178796.5779781",
4+
"retrieved_timestamp": "1764178796.5779781",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 31.785
37+
"mean_eval_time": 31.785
3838
}
3939
},
4040
"generation_config": {}
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"schema_version": "0.0.1",
3-
"evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1764176819.311798",
4-
"retrieved_timestamp": "1764176819.311798",
3+
"evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1764178796.577315",
4+
"retrieved_timestamp": "1764178796.577315",
55
"source_data": [
66
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.12.0/groups/core_scenarios.json"
77
],
@@ -34,7 +34,7 @@
3434
"details": {
3535
"accuracy_description": null,
3636
"efficiency_description": null,
37-
"eval_time_mean_win_rate": 36.156
37+
"mean_eval_time": 36.156
3838
}
3939
},
4040
"generation_config": {}

0 commit comments

Comments
 (0)