
Commit c2a9940

Fixed suggestions from code review and added tests

1 parent ee35721 · commit c2a9940

File tree

11 files changed: +7652 -81 lines

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -111,6 +111,6 @@ options:
 Run below script to perform unit tests for all evaluation platforms.
 
 ```bash
-uv run pytest -s
+uv run pytest -s --disable-warnings
 uv run ruff check
 ```
````

eval_converters/helm/adapter.py

Lines changed: 24 additions & 23 deletions

```diff
@@ -126,12 +126,12 @@ def transform_from_directory(
            print(f'Error during conversion to unified schema in directory "{dir_path}": {e}')
            return None
 
-    def _get_correct_response(self, references: List['Reference']) -> Optional[str]:
+    def _get_correct_response(self, references: List[Reference]) -> Optional[List[str]]:
         """Extracts the text of the first reference that has tags."""
-        for ref in references:
-            if ref.tags:
-                return ref.output.text
-        return None
+        return [
+            ref.output.text
+            for ref in references if ref.tags
+        ]
 
     def _extract_detailed_evaluation_info_for_samples(
         self, request_states: List[RequestState]
```
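
For context, the behavioral change in `_get_correct_response` can be shown with a minimal, self-contained sketch. `Out` and `Ref` are hypothetical stand-ins for HELM's `Output`/`Reference` types, not the real classes: the old helper returned only the first tagged reference's text, while the new one collects the text of every tagged reference.

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Out:
    # Hypothetical stand-in for HELM's Output (illustration only).
    text: str

@dataclass
class Ref:
    # Hypothetical stand-in for HELM's Reference (illustration only).
    output: Out
    tags: List[str] = field(default_factory=list)

def get_correct_response_old(references: List[Ref]) -> Optional[str]:
    # Previous behavior: text of the first tagged reference, or None.
    for ref in references:
        if ref.tags:
            return ref.output.text
    return None

def get_correct_responses_new(references: List[Ref]) -> List[str]:
    # New behavior: texts of all tagged references (possibly an empty list).
    return [ref.output.text for ref in references if ref.tags]

refs = [Ref(Out("Paris"), ["correct"]), Ref(Out("Lyon")), Ref(Out("Paris, France"), ["correct"])]
print(get_correct_response_old(refs))    # Paris
print(get_correct_responses_new(refs))   # ['Paris', 'Paris, France']
```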

```diff
@@ -143,23 +143,23 @@ def _extract_detailed_evaluation_info_for_samples(
 
         for state in request_states:
             references = state.instance.references or []
-            correct_response = self._get_correct_response(references)
+            correct_responses = self._get_correct_response(references)
 
             ground_truth = None
-            if correct_response:
-                ground_truth = next(
-                    (
-                        choice
-                        for choice, response in state.output_mapping.items()
-                        if response in correct_response
-                    ),
-                    None
-                )
-
-            choices_list = [
-                f'{choice}. {response}'
-                for choice, response in state.output_mapping.items()
-            ]
+            choices_list = None
+
+            if state.output_mapping:
+                choices_list = [
+                    [choice, response] for choice, response in state.output_mapping.items()
+                ]
+
+                ground_truth = [
+                    choice for choice, response in state.output_mapping.items()
+                    if choice in correct_responses or response in correct_responses
+                ]
+
+            elif correct_responses:
+                ground_truth = correct_responses
 
             results.append(
                 DetailedEvaluationResultsPerSample(
```
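
To make the new per-sample shapes concrete, here is a small sketch of what the hunk above produces for a hypothetical multiple-choice `output_mapping` (the values are invented, not taken from a real HELM run):

```python
# Hypothetical multiple-choice mapping, in the shape HELM's state.output_mapping provides.
output_mapping = {"A": "Paris", "B": "London", "C": "Berlin"}
correct_responses = ["Paris"]  # what _get_correct_response now returns

# choices is now a list of [choice, response] pairs instead of "A. Paris"-style strings.
choices_list = [[choice, response] for choice, response in output_mapping.items()]
# -> [['A', 'Paris'], ['B', 'London'], ['C', 'Berlin']]

# ground_truth keeps every choice whose key or text appears among the correct responses.
ground_truth = [
    choice for choice, response in output_mapping.items()
    if choice in correct_responses or response in correct_responses
]
# -> ['A']

print(choices_list)
print(ground_truth)
```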

```diff
@@ -230,11 +230,11 @@ def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) ->
 
         source_data = SourceData(
             dataset_name=scenario_dict.get('name'),
-            samples_number=len(request_states),
+            samples_number=len(set(state.instance.id for state in request_states)),  # len(request_states),
             sample_ids=[state.instance.id for state in request_states],
             additional_details={
                 'scenario_name': run_spec.scenario_spec.class_name,
-                'subject': run_spec.scenario_spec.args.get('subject')
+                'scenario_args': run_spec.scenario_spec.args
             }
         )
 
```

```diff
@@ -256,9 +256,10 @@ def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) ->
         for metric_name in metric_names:
             metric_config = MetricConfig(
                 evaluation_description=metric_name,
-                lower_is_better=False
+                lower_is_better=False  # TODO is not always true, possible to fetch correct value from schema.json
             )
 
+            # TODO consider to filter out a subset of relevant stats
            for stat in stats:
                 if not stat.name.name.startswith(metric_name):
                     continue
```
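
The inner loop above keeps every stat whose name starts with the metric name, and the new TODO notes that a narrower subset might be preferable. A sketch with made-up stat names (HELM's real stats carry structured name objects, here reduced to plain strings):

```python
# Hypothetical stat names; only the prefix-matching logic mirrors the adapter code.
stat_names = ["exact_match", "exact_match:perturbation=robustness", "quasi_exact_match", "num_tokens"]
metric_name = "exact_match"

kept = [name for name in stat_names if name.startswith(metric_name)]
print(kept)  # ['exact_match', 'exact_match:perturbation=robustness']
```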

schema/eval.schema.json

Lines changed: 37 additions & 7 deletions

```diff
@@ -360,19 +360,49 @@
           "description": "Full prompt for the model"
         },
         "ground_truth": {
-          "type": "string",
-          "description": "Target response"
+          "description": "Target response that may include one or multiple correct answers.",
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            }
+          ]
         },
         "response": {
           "type": "string",
           "description": "Response from the model"
         },
         "choices": {
-          "type": "array",
-          "description": "Array of possible responses",
-          "items": {
-            "type": "string"
-          }
+          "description": "Either an array of possible responses (list of strings) or an array of string pairs [choice, response].",
+          "oneOf": [
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "array",
+                "items": [
+                  {
+                    "type": "string"
+                  },
+                  {
+                    "type": "string"
+                  }
+                ],
+                "minItems": 2,
+                "maxItems": 2
+              }
+            }
+          ]
         },
         "full_logprobs": {
           "type": "array",
```

schema/eval_types.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-11-11T00:26:04+00:00
+#   timestamp: 2025-11-18T23:38:04+00:00
 
 from __future__ import annotations
 
```

```diff
@@ -153,10 +153,14 @@ class DetailedEvaluationResultsPerSample(BaseModel):
     sample_id: str = Field(..., description='Simple sample ID')
     input: str = Field(..., description='Raw input for the model')
     prompt: Optional[str] = Field(None, description='Full prompt for the model')
-    ground_truth: str = Field(..., description='Target response')
+    ground_truth: Union[str, List[str]] = Field(
+        ...,
+        description='Target response that may include one or multiple correct answers.',
+    )
     response: str = Field(..., description='Response from the model')
-    choices: Optional[List[str]] = Field(
-        None, description='Array of possible responses'
+    choices: Optional[Union[List[str], List[List[str]]]] = Field(
+        None,
+        description='Either an array of possible responses (list of strings) or an array of string pairs [choice, response].',
     )
     full_logprobs: Optional[List[List[FullLogprob]]] = Field(
         None, description='Full log probabilities generated for this sample'
```
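
A minimal sketch of how the widened pydantic types behave; `SamplePayload` below is a hypothetical stand-in that mirrors only the two Union fields shown in this hunk, not the generated `DetailedEvaluationResultsPerSample` class (which also requires fields such as sample_id, input and response):

```python
from typing import List, Optional, Union

from pydantic import BaseModel, Field


class SamplePayload(BaseModel):
    # Hypothetical stand-in mirroring only the widened fields of the generated model.
    ground_truth: Union[str, List[str]] = Field(
        ..., description='Target response that may include one or multiple correct answers.'
    )
    choices: Optional[Union[List[str], List[List[str]]]] = Field(
        None, description='Plain response strings or [choice, response] pairs.'
    )


# Old-style payload: single correct answer, plain-string choices (values are invented).
single = SamplePayload(ground_truth='A', choices=['A. Paris', 'B. London'])

# New-style payload: several correct answers, [choice, response] pairs.
multi = SamplePayload(
    ground_truth=['A', 'C'],
    choices=[['A', 'Paris'], ['B', 'London'], ['C', 'Paris, France']],
)

print(single.ground_truth)
print(multi.choices)
```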
