
Commit c2a9940

Fixed suggestions from code review and added tests

1 parent ee35721 · commit c2a9940

File tree

11 files changed: +7652 -81 lines

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -111,6 +111,6 @@ options:
 Run below script to perform unit tests for all evaluation platforms.
 
 ```bash
-uv run pytest -s
+uv run pytest -s --disable-warnings
 uv run ruff check
 ```
````

eval_converters/helm/adapter.py

Lines changed: 24 additions & 23 deletions

```diff
@@ -126,12 +126,12 @@ def transform_from_directory(
            print(f'Error during conversion to unified schema in directory "{dir_path}": {e}')
            return None
 
-    def _get_correct_response(self, references: List['Reference']) -> Optional[str]:
+    def _get_correct_response(self, references: List[Reference]) -> Optional[List[str]]:
         """Extracts the text of the first reference that has tags."""
-        for ref in references:
-            if ref.tags:
-                return ref.output.text
-        return None
+        return [
+            ref.output.text
+            for ref in references if ref.tags
+        ]
 
     def _extract_detailed_evaluation_info_for_samples(
         self, request_states: List[RequestState]
```
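
For context, the behavioral change in `_get_correct_response` can be shown with a minimal, self-contained sketch. `Out` and `Ref` are hypothetical stand-ins for HELM's `Output`/`Reference` types, not the real classes: the old helper returned only the first tagged reference's text, while the new one collects the text of every tagged reference.

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Out:
    # Hypothetical stand-in for HELM's Output (illustration only).
    text: str

@dataclass
class Ref:
    # Hypothetical stand-in for HELM's Reference (illustration only).
    output: Out
    tags: List[str] = field(default_factory=list)

def get_correct_response_old(references: List[Ref]) -> Optional[str]:
    # Previous behavior: text of the first tagged reference, or None.
    for ref in references:
        if ref.tags:
            return ref.output.text
    return None

def get_correct_responses_new(references: List[Ref]) -> List[str]:
    # New behavior: texts of all tagged references (possibly an empty list).
    return [ref.output.text for ref in references if ref.tags]

refs = [Ref(Out("Paris"), ["correct"]), Ref(Out("Lyon")), Ref(Out("Paris, France"), ["correct"])]
print(get_correct_response_old(refs))    # Paris
print(get_correct_responses_new(refs))   # ['Paris', 'Paris, France']
```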

```diff
@@ -143,23 +143,23 @@ def _extract_detailed_evaluation_info_for_samples(
 
         for state in request_states:
             references = state.instance.references or []
-            correct_response = self._get_correct_response(references)
+            correct_responses = self._get_correct_response(references)
 
             ground_truth = None
-            if correct_response:
-                ground_truth = next(
-                    (
-                        choice
-                        for choice, response in state.output_mapping.items()
-                        if response in correct_response
-                    ),
-                    None
-                )
-
-            choices_list = [
-                f'{choice}. {response}'
-                for choice, response in state.output_mapping.items()
-            ]
+            choices_list = None
+
+            if state.output_mapping:
+                choices_list = [
+                    [choice, response] for choice, response in state.output_mapping.items()
+                ]
+
+                ground_truth = [
+                    choice for choice, response in state.output_mapping.items()
+                    if choice in correct_responses or response in correct_responses
+                ]
+
+            elif correct_responses:
+                ground_truth = correct_responses
 
             results.append(
                 DetailedEvaluationResultsPerSample(
```
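
To make the new per-sample shapes concrete, here is a small sketch of what the hunk above produces for a hypothetical multiple-choice `output_mapping` (the values are invented, not taken from a real HELM run):

```python
# Hypothetical multiple-choice mapping, in the shape HELM's state.output_mapping provides.
output_mapping = {"A": "Paris", "B": "London", "C": "Berlin"}
correct_responses = ["Paris"]  # what _get_correct_response now returns

# choices is now a list of [choice, response] pairs instead of "A. Paris"-style strings.
choices_list = [[choice, response] for choice, response in output_mapping.items()]
# -> [['A', 'Paris'], ['B', 'London'], ['C', 'Berlin']]

# ground_truth keeps every choice whose key or text appears among the correct responses.
ground_truth = [
    choice for choice, response in output_mapping.items()
    if choice in correct_responses or response in correct_responses
]
# -> ['A']

print(choices_list)
print(ground_truth)
```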

```diff
@@ -230,11 +230,11 @@ def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) ->
 
         source_data = SourceData(
             dataset_name=scenario_dict.get('name'),
-            samples_number=len(request_states),
+            samples_number=len(set(state.instance.id for state in request_states)),  # len(request_states),
             sample_ids=[state.instance.id for state in request_states],
             additional_details={
                 'scenario_name': run_spec.scenario_spec.class_name,
-                'subject': run_spec.scenario_spec.args.get('subject')
+                'scenario_args': run_spec.scenario_spec.args
             }
         )
 
```

```diff
@@ -256,9 +256,10 @@ def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) ->
         for metric_name in metric_names:
             metric_config = MetricConfig(
                 evaluation_description=metric_name,
-                lower_is_better=False
+                lower_is_better=False  # TODO is not always true, possible to fetch correct value from schema.json
             )
 
+            # TODO consider to filter out a subset of relevant stats
            for stat in stats:
                 if not stat.name.name.startswith(metric_name):
                     continue
```
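
The inner loop above keeps every stat whose name starts with the metric name, and the new TODO notes that a narrower subset might be preferable. A sketch with made-up stat names (HELM's real stats carry structured name objects, here reduced to plain strings):

```python
# Hypothetical stat names; only the prefix-matching logic mirrors the adapter code.
stat_names = ["exact_match", "exact_match:perturbation=robustness", "quasi_exact_match", "num_tokens"]
metric_name = "exact_match"

kept = [name for name in stat_names if name.startswith(metric_name)]
print(kept)  # ['exact_match', 'exact_match:perturbation=robustness']
```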

schema/eval.schema.json

Lines changed: 37 additions & 7 deletions

```diff
@@ -360,19 +360,49 @@
           "description": "Full prompt for the model"
         },
         "ground_truth": {
-          "type": "string",
-          "description": "Target response"
+          "description": "Target response that may include one or multiple correct answers.",
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            }
+          ]
         },
         "response": {
           "type": "string",
           "description": "Response from the model"
         },
         "choices": {
-          "type": "array",
-          "description": "Array of possible responses",
-          "items": {
-            "type": "string"
-          }
+          "description": "Either an array of possible responses (list of strings) or an array of string pairs [choice, response].",
+          "oneOf": [
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "array",
+                "items": [
+                  {
+                    "type": "string"
+                  },
+                  {
+                    "type": "string"
+                  }
+                ],
+                "minItems": 2,
+                "maxItems": 2
+              }
+            }
+          ]
         },
         "full_logprobs": {
           "type": "array",
```

schema/eval_types.py

Lines changed: 8 additions & 4 deletions

```diff
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-11-11T00:26:04+00:00
+#   timestamp: 2025-11-18T23:38:04+00:00
 
 from __future__ import annotations
 
```

```diff
@@ -153,10 +153,14 @@ class DetailedEvaluationResultsPerSample(BaseModel):
     sample_id: str = Field(..., description='Simple sample ID')
     input: str = Field(..., description='Raw input for the model')
     prompt: Optional[str] = Field(None, description='Full prompt for the model')
-    ground_truth: str = Field(..., description='Target response')
+    ground_truth: Union[str, List[str]] = Field(
+        ...,
+        description='Target response that may include one or multiple correct answers.',
+    )
     response: str = Field(..., description='Response from the model')
-    choices: Optional[List[str]] = Field(
-        None, description='Array of possible responses'
+    choices: Optional[Union[List[str], List[List[str]]]] = Field(
+        None,
+        description='Either an array of possible responses (list of strings) or an array of string pairs [choice, response].',
     )
     full_logprobs: Optional[List[List[FullLogprob]]] = Field(
         None, description='Full log probabilities generated for this sample'
```
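
A minimal sketch of how the widened pydantic types behave; `SamplePayload` below is a hypothetical stand-in that mirrors only the two Union fields shown in this hunk, not the generated `DetailedEvaluationResultsPerSample` class (which also requires fields such as sample_id, input and response):

```python
from typing import List, Optional, Union

from pydantic import BaseModel, Field


class SamplePayload(BaseModel):
    # Hypothetical stand-in mirroring only the widened fields of the generated model.
    ground_truth: Union[str, List[str]] = Field(
        ..., description='Target response that may include one or multiple correct answers.'
    )
    choices: Optional[Union[List[str], List[List[str]]]] = Field(
        None, description='Plain response strings or [choice, response] pairs.'
    )


# Old-style payload: single correct answer, plain-string choices (values are invented).
single = SamplePayload(ground_truth='A', choices=['A. Paris', 'B. London'])

# New-style payload: several correct answers, [choice, response] pairs.
multi = SamplePayload(
    ground_truth=['A', 'C'],
    choices=[['A', 'Paris'], ['B', 'London'], ['C', 'Paris, France']],
)

print(single.ground_truth)
print(multi.choices)
```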
