diff --git a/CHANGELOG.MD b/CHANGELOG.MD
index 381e1af..874e395 100644
--- a/CHANGELOG.MD
+++ b/CHANGELOG.MD
@@ -12,6 +12,7 @@ _released 04--2026
### Added
- **AI Evaluation Template Support**: Uploading test result support for TestRail's AI Evaluation Template with multi-dimensional quality ratings. See README "AI Evaluation Template Support" section for complete examples.
+- **Multi-Step AI Evaluation Workflows**: Support for combining step-level execution tracking (`testrail_result_step`) with overall quality ratings in AI Evaluation tests. See README "Multi-Step AI Evaluation Workflows" section.
- **Global Quality Rating via `--result-fields`**: Added support for applying quality ratings to all test results using `--result-fields quality_rating:'{"category": value}'`. Test-specific quality ratings in XML/JSON properties take precedence over CLI global ratings.
## [1.14.1]
diff --git a/README.md b/README.md
index e7abcc6..aaa78ed 100644
--- a/README.md
+++ b/README.md
@@ -690,6 +690,79 @@ trcli parse_robot \
--suite-id 100
```
+### Multi-Step AI Evaluation Workflows
+
+For complex AI systems with multiple pipeline stages (like RAG, multi-agent systems, or sequential AI workflows), you can combine **step-level execution tracking** with **overall quality assessment** in your AI Evaluation tests. The `quality_rating` result field can be added to Test Cases (Steps).
+
+#### How It Works
+
+**Step-Level Tracking:**
+- Each step has its own **status** (passed, failed, skipped, untested)
+- See exactly where in the pipeline the failure occurred
+
+**Overall Quality Rating:**
+- One **quality_rating** applies to the entire test result
+- Assess the final output quality across multiple dimensions
+
+#### JUnit XML Example
+
+```xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+```
+
+**Upload Command:**
+```bash
+trcli parse_junit \
+ -f rag_pipeline_results.xml \
+ --project-id 1 \
+ --suite-id 100
+```
+
+#### Important Notes
+
+1. **Quality Rating Scope**: The `quality_rating` applies to the **entire test result**, not individual steps. It represents the overall quality of the AI system's final output.
+
+2. **Step Status Format**: Use `status:description` format for step-level tracking:
+ - `passed:Step 1 Query Understanding`
+ - `failed:Step 3 Answer Generation`
+ - `skipped:Optional Enhancement`
+ - `untested:Step 4 Response Validation`
+
+3. **Available Step Statuses**:
+ - `passed` (status_id: 1) - Step completed successfully
+ - `untested` (status_id: 3) - Step not executed
+ - `skipped` (status_id: 4) - Step intentionally skipped
+ - `failed` (status_id: 5) - Step failed
+
+4. **Test Status Aggregation**: The overall test status follows **fail-fast** logic - if any step fails, the entire test fails.
+
## Behavior-Driven Development (BDD) Support
The TestRail CLI provides comprehensive support for Behavior-Driven Development workflows using Gherkin syntax. The BDD features enable you to manage test cases written in Gherkin format, execute BDD tests with various frameworks (Cucumber, Behave, pytest-bdd, etc.), and seamlessly upload results to TestRail.
diff --git a/tests/test_data/XML/sample_ai_eval_multistep_workflow.xml b/tests/test_data/XML/sample_ai_eval_multistep_workflow.xml
new file mode 100644
index 0000000..6f8220b
--- /dev/null
+++ b/tests/test_data/XML/sample_ai_eval_multistep_workflow.xml
@@ -0,0 +1,90 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Expected: Python is the primary language for machine learning
+ Actual: JavaScript is the primary language for machine learning
+
+ Issue: Model hallucinated incorrect information despite correct document retrieval
+ Impact: Users receive misleading information that could affect decision-making
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Expected: Retrieved at least 3 relevant documents about quantum mechanics
+ Actual: Retrieved 0 relevant documents (only found documents about classical physics)
+
+ Issue: Vector search embeddings failed to capture semantic meaning of quantum mechanics query
+ Impact: System cannot provide accurate answers for domain-specific questions
+ Recommendation: Retrain embedding model with physics-domain knowledge or use specialized vector database
+
+
+
+
+
+
diff --git a/tests/test_junit_quality_rating.py b/tests/test_junit_quality_rating.py
index 7555e78..116694d 100644
--- a/tests/test_junit_quality_rating.py
+++ b/tests/test_junit_quality_rating.py
@@ -259,3 +259,253 @@ def test_backward_compatibility_no_quality_rating(self, env, tmp_path):
assert "case_id" in result_dict
assert "status_id" in result_dict
assert "custom_field" in result_dict
+
+ # ========== Step-Level Results with Quality Rating ==========
+
+ def test_step_level_results_with_quality_rating(self, env, tmp_path):
+ """Test AI Evaluation with step-level results and overall quality rating"""
+ xml_content = """
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+ xml_file = tmp_path / "test_step_level_quality.xml"
+ xml_file.write_text(xml_content)
+
+ env.file = xml_file
+ parser = JunitParser(env)
+ suites = parser.parse_file()
+
+ test_case = suites[0].testsections[0].testcases[0]
+ result = test_case.result
+
+ # Verify step-level results
+ assert len(result.custom_step_results) == 4
+ assert result.custom_step_results[0].content == "Step 1 Query Understanding"
+ assert result.custom_step_results[0].status_id == 1 # Passed
+ assert result.custom_step_results[1].content == "Step 2 Document Retrieval"
+ assert result.custom_step_results[1].status_id == 1 # Passed
+ assert result.custom_step_results[2].content == "Step 3 Answer Generation"
+ assert result.custom_step_results[2].status_id == 5 # Failed
+ assert result.custom_step_results[3].content == "Step 4 Response Validation"
+ assert result.custom_step_results[3].status_id == 3 # Untested
+
+ # Verify overall quality rating
+ assert result.quality_rating == {"factual_accuracy": 2, "coherence": 3, "completeness": 1}
+
+ # Verify overall test status is failed
+ assert result.status_id == 5
+
+ def test_step_level_serialization_with_quality_rating(self, env, tmp_path):
+ """Test that step-level results and quality rating serialize correctly"""
+ xml_content = """
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+ xml_file = tmp_path / "test_step_serialization.xml"
+ xml_file.write_text(xml_content)
+
+ env.file = xml_file
+ parser = JunitParser(env)
+ suites = parser.parse_file()
+
+ test_case = suites[0].testsections[0].testcases[0]
+ result_dict = test_case.result.to_dict()
+
+ # Verify custom_step_results serialization
+ assert "custom_step_results" in result_dict
+ assert len(result_dict["custom_step_results"]) == 3
+ assert result_dict["custom_step_results"][0]["content"] == "Intent Detection"
+ assert result_dict["custom_step_results"][0]["status_id"] == 1
+ assert result_dict["custom_step_results"][1]["content"] == "Response Generation"
+ assert result_dict["custom_step_results"][1]["status_id"] == 1
+ assert result_dict["custom_step_results"][2]["content"] == "Quality Check"
+ assert result_dict["custom_step_results"][2]["status_id"] == 1
+
+ # Verify quality_rating at root level
+ assert "quality_rating" in result_dict
+ assert result_dict["quality_rating"] == {"accuracy": 5, "relevance": 5, "tone": 4}
+
+ def test_step_level_mixed_statuses(self, env, tmp_path):
+ """Test step-level results with various status combinations"""
+ xml_content = """
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+ xml_file = tmp_path / "test_mixed_steps.xml"
+ xml_file.write_text(xml_content)
+
+ env.file = xml_file
+ parser = JunitParser(env)
+ suites = parser.parse_file()
+
+ test_case = suites[0].testsections[0].testcases[0]
+ result = test_case.result
+
+ # Verify all step statuses
+ assert len(result.custom_step_results) == 3
+ assert result.custom_step_results[0].status_id == 1 # Passed
+ assert result.custom_step_results[1].status_id == 4 # Skipped
+ assert result.custom_step_results[2].status_id == 1 # Passed
+
+ # Overall test should pass (no failures)
+ assert result.status_id == 1
+
+ # Quality rating should be preserved
+ assert result.quality_rating == {"quality": 4}
+
+ def test_step_level_without_quality_rating(self, env, tmp_path):
+ """Test that step-level results work without quality rating (backward compatibility)"""
+ xml_content = """
+
+
+
+
+
+
+
+
+
+
+"""
+
+ xml_file = tmp_path / "test_steps_no_rating.xml"
+ xml_file.write_text(xml_content)
+
+ env.file = xml_file
+ parser = JunitParser(env)
+ suites = parser.parse_file()
+
+ test_case = suites[0].testsections[0].testcases[0]
+ result_dict = test_case.result.to_dict()
+
+ # Should have steps
+ assert "custom_step_results" in result_dict
+ assert len(result_dict["custom_step_results"]) == 2
+
+ # Should NOT have quality_rating
+ assert "quality_rating" not in result_dict
+
+ def test_quality_rating_without_steps(self, env, tmp_path):
+ """Test that quality rating works without step-level results"""
+ xml_content = """
+
+
+
+
+
+
+
+
+
+"""
+
+ xml_file = tmp_path / "test_rating_no_steps.xml"
+ xml_file.write_text(xml_content)
+
+ env.file = xml_file
+ parser = JunitParser(env)
+ suites = parser.parse_file()
+
+ test_case = suites[0].testsections[0].testcases[0]
+ result_dict = test_case.result.to_dict()
+
+ # Should have quality_rating
+ assert "quality_rating" in result_dict
+ assert result_dict["quality_rating"] == {"accuracy": 5}
+
+ # Should NOT have custom_step_results (empty list skipped by serialization)
+ assert "custom_step_results" not in result_dict or result_dict["custom_step_results"] == []
+
+ def test_parse_sample_multistep_workflow(self, env):
+ """Test parsing the sample multi-step AI evaluation workflow file"""
+ env.file = Path(__file__).parent / "test_data/XML/sample_ai_eval_multistep_workflow.xml"
+ parser = JunitParser(env)
+ suites = parser.parse_file()
+
+ assert len(suites) == 1
+ suite = suites[0]
+ assert len(suite.testsections) == 1
+ section = suite.testsections[0]
+ assert len(section.testcases) == 3
+
+ # Test 1: All steps pass
+ test1 = section.testcases[0]
+ assert test1.result.case_id == 1000
+ assert test1.result.status_id == 1 # Passed
+ assert len(test1.result.custom_step_results) == 4
+ assert all(step.status_id == 1 for step in test1.result.custom_step_results) # All passed
+ assert test1.result.quality_rating == {
+ "factual_accuracy": 5,
+ "coherence": 5,
+ "completeness": 4,
+ "relevance": 5,
+ }
+
+ # Test 2: Step 3 fails
+ test2 = section.testcases[1]
+ assert test2.result.case_id == 1001
+ assert test2.result.status_id == 5 # Failed
+ assert len(test2.result.custom_step_results) == 4
+ assert test2.result.custom_step_results[0].status_id == 1 # Step 1 passed
+ assert test2.result.custom_step_results[1].status_id == 1 # Step 2 passed
+ assert test2.result.custom_step_results[2].status_id == 5 # Step 3 failed
+ assert test2.result.custom_step_results[3].status_id == 3 # Step 4 untested
+ assert test2.result.quality_rating == {
+ "factual_accuracy": 1,
+ "coherence": 3,
+ "completeness": 2,
+ "relevance": 2,
+ }
+
+ # Test 3: Step 2 fails
+ test3 = section.testcases[2]
+ assert test3.result.case_id == 1002
+ assert test3.result.status_id == 5 # Failed
+ assert len(test3.result.custom_step_results) == 4
+ assert test3.result.custom_step_results[0].status_id == 1 # Step 1 passed
+ assert test3.result.custom_step_results[1].status_id == 5 # Step 2 failed
+ assert test3.result.custom_step_results[2].status_id == 3 # Step 3 untested
+ assert test3.result.custom_step_results[3].status_id == 3 # Step 4 untested
+ assert test3.result.quality_rating == {
+ "factual_accuracy": 0,
+ "coherence": 1,
+ "completeness": 0,
+ "relevance": 1,
+ }