diff --git a/CHANGELOG.MD b/CHANGELOG.MD index 381e1af..874e395 100644 --- a/CHANGELOG.MD +++ b/CHANGELOG.MD @@ -12,6 +12,7 @@ _released 04--2026 ### Added - **AI Evaluation Template Support**: Uploading test result support for TestRail's AI Evaluation Template with multi-dimensional quality ratings. See README "AI Evaluation Template Support" section for complete examples. + - **Multi-Step AI Evaluation Workflows**: Support for combining step-level execution tracking (`testrail_result_step`) with overall quality ratings in AI Evaluation tests. See README "Multi-Step AI Evaluation Workflows" section. - **Global Quality Rating via `--result-fields`**: Added support for applying quality ratings to all test results using `--result-fields quality_rating:'{"category": value}'`. Test-specific quality ratings in XML/JSON properties take precedence over CLI global ratings. ## [1.14.1] diff --git a/README.md b/README.md index e7abcc6..aaa78ed 100644 --- a/README.md +++ b/README.md @@ -690,6 +690,79 @@ trcli parse_robot \ --suite-id 100 ``` +### Multi-Step AI Evaluation Workflows + +For complex AI systems with multiple pipeline stages (like RAG, multi-agent systems, or sequential AI workflows), you can combine **step-level execution tracking** with **overall quality assessment** in your AI Evaluation tests. The `quality_rating` result field can be added to the Test Case (Steps) template. + +#### How It Works + +**Step-Level Tracking:** +- Each step has its own **status** (passed, failed, skipped, untested) +- See exactly where in the pipeline the failure occurred + +**Overall Quality Rating:** +- One **quality_rating** applies to the entire test result +- Assess the final output quality across multiple dimensions + +#### JUnit XML Example + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +**Upload Command:** +```bash +trcli parse_junit \ + -f rag_pipeline_results.xml \ + --project-id 1 \ + --suite-id 100 +``` + +#### Important Notes + +1. 
**Quality Rating Scope**: The `quality_rating` applies to the **entire test result**, not individual steps. It represents the overall quality of the AI system's final output. + +2. **Step Status Format**: Use `status:description` format for step-level tracking: + - `passed:Step 1 Query Understanding` + - `failed:Step 3 Answer Generation` + - `skipped:Optional Enhancement` + - `untested:Step 4 Response Validation` + +3. **Available Step Statuses**: + - `passed` (status_id: 1) - Step completed successfully + - `untested` (status_id: 3) - Step not executed + - `skipped` (status_id: 4) - Step intentionally skipped + - `failed` (status_id: 5) - Step failed + +4. **Test Status Aggregation**: The overall test status follows **fail-fast** logic - if any step fails, the entire test fails. + ## Behavior-Driven Development (BDD) Support The TestRail CLI provides comprehensive support for Behavior-Driven Development workflows using Gherkin syntax. The BDD features enable you to manage test cases written in Gherkin format, execute BDD tests with various frameworks (Cucumber, Behave, pytest-bdd, etc.), and seamlessly upload results to TestRail. 
diff --git a/tests/test_data/XML/sample_ai_eval_multistep_workflow.xml b/tests/test_data/XML/sample_ai_eval_multistep_workflow.xml new file mode 100644 index 0000000..6f8220b --- /dev/null +++ b/tests/test_data/XML/sample_ai_eval_multistep_workflow.xml @@ -0,0 +1,90 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Expected: Python is the primary language for machine learning + Actual: JavaScript is the primary language for machine learning + + Issue: Model hallucinated incorrect information despite correct document retrieval + Impact: Users receive misleading information that could affect decision-making + + + + + + + + + + + + + + + + + + + + + + + + + Expected: Retrieved at least 3 relevant documents about quantum mechanics + Actual: Retrieved 0 relevant documents (only found documents about classical physics) + + Issue: Vector search embeddings failed to capture semantic meaning of quantum mechanics query + Impact: System cannot provide accurate answers for domain-specific questions + Recommendation: Retrain embedding model with physics-domain knowledge or use specialized vector database + + + + + + diff --git a/tests/test_junit_quality_rating.py b/tests/test_junit_quality_rating.py index 7555e78..116694d 100644 --- a/tests/test_junit_quality_rating.py +++ b/tests/test_junit_quality_rating.py @@ -259,3 +259,253 @@ def test_backward_compatibility_no_quality_rating(self, env, tmp_path): assert "case_id" in result_dict assert "status_id" in result_dict assert "custom_field" in result_dict + + # ========== Step-Level Results with Quality Rating ========== + + def test_step_level_results_with_quality_rating(self, env, tmp_path): + """Test AI Evaluation with step-level results and overall quality rating""" + xml_content = """ + + + + + + + + + + + + + + + + +""" + + xml_file = tmp_path / "test_step_level_quality.xml" + xml_file.write_text(xml_content) + + env.file = xml_file + parser = JunitParser(env) + suites = 
parser.parse_file() + + test_case = suites[0].testsections[0].testcases[0] + result = test_case.result + + # Verify step-level results + assert len(result.custom_step_results) == 4 + assert result.custom_step_results[0].content == "Step 1 Query Understanding" + assert result.custom_step_results[0].status_id == 1 # Passed + assert result.custom_step_results[1].content == "Step 2 Document Retrieval" + assert result.custom_step_results[1].status_id == 1 # Passed + assert result.custom_step_results[2].content == "Step 3 Answer Generation" + assert result.custom_step_results[2].status_id == 5 # Failed + assert result.custom_step_results[3].content == "Step 4 Response Validation" + assert result.custom_step_results[3].status_id == 3 # Untested + + # Verify overall quality rating + assert result.quality_rating == {"factual_accuracy": 2, "coherence": 3, "completeness": 1} + + # Verify overall test status is failed + assert result.status_id == 5 + + def test_step_level_serialization_with_quality_rating(self, env, tmp_path): + """Test that step-level results and quality rating serialize correctly""" + xml_content = """ + + + + + + + + + + + + +""" + + xml_file = tmp_path / "test_step_serialization.xml" + xml_file.write_text(xml_content) + + env.file = xml_file + parser = JunitParser(env) + suites = parser.parse_file() + + test_case = suites[0].testsections[0].testcases[0] + result_dict = test_case.result.to_dict() + + # Verify custom_step_results serialization + assert "custom_step_results" in result_dict + assert len(result_dict["custom_step_results"]) == 3 + assert result_dict["custom_step_results"][0]["content"] == "Intent Detection" + assert result_dict["custom_step_results"][0]["status_id"] == 1 + assert result_dict["custom_step_results"][1]["content"] == "Response Generation" + assert result_dict["custom_step_results"][1]["status_id"] == 1 + assert result_dict["custom_step_results"][2]["content"] == "Quality Check" + assert 
result_dict["custom_step_results"][2]["status_id"] == 1 + + # Verify quality_rating at root level + assert "quality_rating" in result_dict + assert result_dict["quality_rating"] == {"accuracy": 5, "relevance": 5, "tone": 4} + + def test_step_level_mixed_statuses(self, env, tmp_path): + """Test step-level results with various status combinations""" + xml_content = """ + + + + + + + + + + + + +""" + + xml_file = tmp_path / "test_mixed_steps.xml" + xml_file.write_text(xml_content) + + env.file = xml_file + parser = JunitParser(env) + suites = parser.parse_file() + + test_case = suites[0].testsections[0].testcases[0] + result = test_case.result + + # Verify all step statuses + assert len(result.custom_step_results) == 3 + assert result.custom_step_results[0].status_id == 1 # Passed + assert result.custom_step_results[1].status_id == 4 # Skipped + assert result.custom_step_results[2].status_id == 1 # Passed + + # Overall test should pass (no failures) + assert result.status_id == 1 + + # Quality rating should be preserved + assert result.quality_rating == {"quality": 4} + + def test_step_level_without_quality_rating(self, env, tmp_path): + """Test that step-level results work without quality rating (backward compatibility)""" + xml_content = """ + + + + + + + + + + +""" + + xml_file = tmp_path / "test_steps_no_rating.xml" + xml_file.write_text(xml_content) + + env.file = xml_file + parser = JunitParser(env) + suites = parser.parse_file() + + test_case = suites[0].testsections[0].testcases[0] + result_dict = test_case.result.to_dict() + + # Should have steps + assert "custom_step_results" in result_dict + assert len(result_dict["custom_step_results"]) == 2 + + # Should NOT have quality_rating + assert "quality_rating" not in result_dict + + def test_quality_rating_without_steps(self, env, tmp_path): + """Test that quality rating works without step-level results""" + xml_content = """ + + + + + + + + + +""" + + xml_file = tmp_path / "test_rating_no_steps.xml" + 
def test_parse_sample_multistep_workflow(self, env):
    """Parse the bundled multi-step AI evaluation sample and verify each test case.

    The assertions expect one suite containing one section with three test
    cases: a run where every step passes, a run that fails at step 3, and a
    run that fails at step 2. Each case carries four step results and an
    overall quality rating.
    """
    sample_path = Path(__file__).parent / "test_data/XML/sample_ai_eval_multistep_workflow.xml"
    env.file = sample_path
    parsed_suites = JunitParser(env).parse_file()

    assert len(parsed_suites) == 1
    sections = parsed_suites[0].testsections
    assert len(sections) == 1
    cases = sections[0].testcases
    assert len(cases) == 3

    # One row per test case, in file order:
    # (case_id, overall status_id, per-step status_ids, quality rating).
    # Status ids: 1 = passed, 3 = untested, 5 = failed.
    expectations = [
        (
            1000,
            1,
            [1, 1, 1, 1],
            {"factual_accuracy": 5, "coherence": 5, "completeness": 4, "relevance": 5},
        ),
        (
            1001,
            5,
            [1, 1, 5, 3],
            {"factual_accuracy": 1, "coherence": 3, "completeness": 2, "relevance": 2},
        ),
        (
            1002,
            5,
            [1, 5, 3, 3],
            {"factual_accuracy": 0, "coherence": 1, "completeness": 0, "relevance": 1},
        ),
    ]

    for case, (case_id, status_id, step_status_ids, rating) in zip(cases, expectations):
        outcome = case.result
        assert outcome.case_id == case_id
        assert outcome.status_id == status_id
        assert len(outcome.custom_step_results) == 4
        assert [step.status_id for step in outcome.custom_step_results] == step_status_ids
        assert outcome.quality_rating == rating