Commit dbdf392

Merge remote-tracking branch 'origin' into mm_docs
2 parents: 73ed7ae + 1e37c10

55 files changed (+3067 additions, -842 deletions). Only a subset of the changed files is shown below.

.github/actions/docker-build/action.yml

Lines changed: 20 additions & 0 deletions
@@ -151,6 +151,26 @@ runs:
       # Exit with the build's exit code
       exit ${BUILD_EXIT_CODE}

+  - name: Run Sanity Check on Runtime Image
+    if: inputs.target == 'runtime'
+    shell: bash
+    run: |
+      IMAGE_TAG="${{ steps.build.outputs.image_tag }}"
+      echo "Running sanity check on image: $IMAGE_TAG"
+
+      # Run the sanity check script inside the container
+      # The script is located in /workspace/deploy/sanity_check.py in runtime containers
+      set +e
+      docker run --rm "$IMAGE_TAG" python /workspace/deploy/sanity_check.py --runtime-check --no-gpu-check
+      SANITY_CHECK_EXIT_CODE=$?
+      set -e
+      if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
+        echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
+        exit ${SANITY_CHECK_EXIT_CODE}
+      else
+        echo "✅ Sanity check passed"
+      fi
+
   - name: Capture Build Metrics
     id: metrics
     shell: bash
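The new step invokes deploy/sanity_check.py inside the freshly built runtime image, but the script itself is not part of this diff. A minimal sketch of what such a runtime check could look like follows; the module names and argument handling here are illustrative assumptions, not the actual implementation:

# Hypothetical sketch; the real /workspace/deploy/sanity_check.py is not shown in this commit.
import argparse
import importlib
import sys

def main() -> int:
    parser = argparse.ArgumentParser(description="Verify the runtime image is usable.")
    parser.add_argument("--runtime-check", action="store_true",
                        help="Verify that the ai-dynamo packages import cleanly.")
    parser.add_argument("--no-gpu-check", action="store_true",
                        help="Skip GPU probing (CI build runners may not expose GPUs).")
    args = parser.parse_args()

    if args.runtime_check:
        # Assumed module names; the installed ai-dynamo wheels may expose different paths.
        for module in ("dynamo.runtime", "dynamo.planner"):
            try:
                importlib.import_module(module)
            except ImportError as exc:
                print(f"ERROR: failed to import {module}: {exc}")
                return 1

    if not args.no_gpu_check:
        pass  # GPU probing would go here; the CI step skips it via --no-gpu-check.

    print("Sanity check passed")
    return 0

if __name__ == "__main__":
    sys.exit(main())

Because the check runs under set +e and its exit code is captured, a non-zero return from the script fails the build step without aborting the surrounding shell.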

.github/actions/pytest/action.yml

Lines changed: 20 additions & 35 deletions
@@ -77,23 +77,31 @@ runs:
        fi
      fi

-      # Run without --rm so we can copy results even if container crashes (example SIGSEGV exit 139)
-      docker run ${GPU_FLAGS} -w /workspace \
+      # Get absolute path for test-results directory and ensure it has proper permissions
+      TEST_RESULTS_DIR="$(pwd)/test-results"
+      chmod 777 "${TEST_RESULTS_DIR}"
+      echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
+
+      docker run ${GPU_FLAGS} --rm -w /workspace \
        --cpus=${NUM_CPUS} \
        --network host \
        --name ${{ env.CONTAINER_ID }}_pytest \
+       -v "${TEST_RESULTS_DIR}:/workspace/test-results" \
        ${{ inputs.image_tag }} \
-       bash -c "mkdir -p /workspace/test-results && ${PYTEST_CMD}"
+       bash -c "${PYTEST_CMD}"

      TEST_EXIT_CODE=$?
      echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
      echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"

-      # Copy test results from container to host
-      docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/test-results . || echo "Failed to copy test results"
-
-      # Clean up container
-      docker rm -f ${{ env.CONTAINER_ID }}_pytest || echo "Failed to clean up container"
+      # Verify test results were written (only in normal mode)
+      if [[ "${{ inputs.dry_run }}" != "true" ]]; then
+        if [[ -f "${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}" ]]; then
+          echo "✅ Test results file found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
+        else
+          echo "⚠️ Test results file not found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
+        fi
+      fi

      # Always continue to results processing
      exit 0
@@ -107,13 +115,6 @@ runs:
      STR_TEST_TYPE=$(echo "${{ inputs.test_type }}" | tr ', ' '_')
      echo "STR_TEST_TYPE=${STR_TEST_TYPE}" >> $GITHUB_ENV

-      # Skip XML processing if in dry-run mode
-      if [[ "${{ inputs.dry_run }}" == "true" ]]; then
-        echo "✅ Dry-run mode: Test collection completed"
-        echo "⏭️ No JUnit XML generated (dry-run mode)"
-        exit 0
-      fi
-
      # Check for JUnit XML file and determine test status
      JUNIT_FILE="test-results/pytest_test_report.xml"

@@ -125,23 +126,9 @@ runs:
      ERROR_TESTS=$(grep -o 'errors="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
      echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)"

-      # Create uniquely named metadata file with step context information
-      # Use framework-testtype-arch to make it unique per test run
-      METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.json"
-      JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.xml"
-
      # Rename XML file to unique name
+      JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml"
      mv "$JUNIT_FILE" "test-results/$JUNIT_NAME"
-
-      echo '{' > "$METADATA_FILE"
-      echo ' "job_name": "${{ github.job }}",' >> "$METADATA_FILE"
-      echo ' "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE"
-      echo ' "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE"
-      echo ' "platform_arch": "${{ inputs.platform_arch }}",' >> "$METADATA_FILE"
-      echo ' "junit_xml_file": "'"$JUNIT_NAME"'",' >> "$METADATA_FILE"
-      echo ' "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE"
-      echo '}' >> "$METADATA_FILE"
-      echo "📝 Created test metadata file: $METADATA_FILE"
      echo "📝 Renamed XML file to: $JUNIT_NAME"
    else
      echo "⚠️ JUnit XML file not found - test results may not be available for upload"
@@ -155,10 +142,8 @@ runs:

  - name: Upload Test Results
    uses: actions/upload-artifact@v4
-    if: always() && inputs.dry_run != 'true' # Skip upload in dry-run mode
+    if: always() # Always upload test results, even if tests failed
    with:
-      name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
-      path: |
-        test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
-        test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
+      name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
+      path: test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
    retention-days: 7
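Mounting TEST_RESULTS_DIR into the container means the JUnit XML lands directly on the host, so the step can use --rm again instead of keeping the container around for docker cp. The results-processing step then reads test counts out of that XML with grep on the testsuite attributes; the following purely illustrative Python helper (not part of this commit, file name hypothetical) shows what those grep patterns extract:

# summarize_junit.py - illustrative equivalent of the grep-based count extraction.
import sys
import xml.etree.ElementTree as ET

def summarize(junit_path: str) -> tuple[int, int, int]:
    # pytest's --junitxml output wraps a single <testsuite> in a <testsuites> root.
    root = ET.parse(junit_path).getroot()
    suite = root if root.tag == "testsuite" else root.find("testsuite")
    total = int(suite.get("tests", "0"))
    failed = int(suite.get("failures", "0"))
    errors = int(suite.get("errors", "0"))
    return total, failed, errors

if __name__ == "__main__":
    total, failed, errors = summarize(sys.argv[1])
    print(f"{total} tests completed ({failed} failed, {errors} errors)")

Usage would be, for example: python summarize_junit.py test-results/pytest_test_report.xml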

.github/workflows/container-validation-backends.yml

Lines changed: 71 additions & 52 deletions
@@ -76,6 +76,9 @@ jobs:
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Linter
        shell: bash
        env:
@@ -416,6 +419,7 @@ jobs:
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace=$NAMESPACE
      - name: Run Fault Tolerance Tests
+        id: run-ft-tests
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
@@ -437,14 +441,49 @@ jobs:
          pip install -r container/deps/requirements.test.txt
          pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic

-          # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
+          # Create test-results directory
+          mkdir -p test-results
+
+          # Run the pytest command with JUnit XML output
+          set +e # Don't exit on test failures
          pytest tests/fault_tolerance/deploy/test_deployment.py \
            -m 'k8s and fault_tolerance' \
            -k '${{ matrix.framework.test_scenario }}' \
            -s -v \
            --namespace ${NAMESPACE} \
            --image ${IMAGE} \
-            --client-type legacy
+            --client-type legacy \
+            --junitxml=test-results/pytest_ft_report.xml \
+            --tb=short
+
+          TEST_EXIT_CODE=$?
+          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
+          echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
+
+          exit ${TEST_EXIT_CODE}
+        continue-on-error: true
+
+      - name: Process Fault Tolerance Test Results
+        if: always()
+        run: |
+          set -x
+
+          # Rename JUnit XML with unique naming if it exists
+          if [ -f "test-results/pytest_ft_report.xml" ]; then
+            mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
+            echo "✅ JUnit XML report renamed with unique identifier"
+          else
+            echo "⚠️ JUnit XML report not found"
+          fi
+
+      - name: Upload Fault Tolerance Test Results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
+          path: test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
+          retention-days: 7
+
      - name: Cleanup
        if: always()
        timeout-minutes: 5
@@ -468,56 +507,6 @@ jobs:
          kubectl delete namespace $NAMESPACE || true
          echo "Namespace $NAMESPACE completed."

-  # Upload metrics for this workflow and all its jobs
-  upload-workflow-metrics:
-    name: Upload Workflow Metrics
-    runs-on: gitlab
-    if: always() # Always run, even if other jobs fail
-    needs: [backend-status-check] # Wait for the status check which waits for all build jobs
-
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.x'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests
-
-      - name: Download build metrics
-        uses: actions/download-artifact@v4
-        with:
-          pattern: build-metrics-*
-          path: build-metrics/
-          merge-multiple: true
-        continue-on-error: true # Don't fail if artifacts don't exist
-
-      - name: Download test results
-        uses: actions/download-artifact@v4
-        with:
-          pattern: test-results-*
-          path: test-results/
-          merge-multiple: true
-        continue-on-error: true # Don't fail if artifacts don't exist
-
-      - name: Upload Complete Workflow Metrics
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          WORKFLOW_INDEX: ${{ secrets.WORKFLOW_INDEX }}
-          JOB_INDEX: ${{ secrets.JOB_INDEX }}
-          STEPS_INDEX: ${{ secrets.STEPS_INDEX }}
-          # Container and test index configuration
-          CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
-          TEST_INDEX: ${{ secrets.TEST_INDEX }}
-        run: |
-          # Upload complete workflow metrics including container metrics
-          python3 .github/workflows/upload_complete_workflow_metrics.py
-
  deploy-operator:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment this when we have a way to test the deploy-operator job in CI.
@@ -637,13 +626,17 @@ jobs:
          kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
          kubectl config get-contexts
      - name: Run Tests
+        id: run-tests
        env:
          NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace=$NAMESPACE

+          # Redirect all output to a log file while still showing it
+          exec > >(tee -a test-output.log) 2>&1
+
          cd examples/backends/$FRAMEWORK
          export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
          export KUBE_NS=$NAMESPACE
@@ -736,6 +729,32 @@ jobs:
            echo "Test passed: Response matches expected format and content"
          fi
          exit $TEST_RESULT
+        continue-on-error: true
+
+      - name: Process Deployment Test Results
+        if: always()
+        run: |
+          set -x
+
+          # Create test-results directory
+          mkdir -p test-results
+
+          # Copy and rename the test output log with unique naming
+          if [ -f "test-output.log" ]; then
+            cp test-output.log "test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log"
+            echo "✅ Test output log copied to test-results/"
+          else
+            echo "⚠️ test-output.log not found"
+          fi
+
+      - name: Upload Deployment Test Results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-${{ env.FRAMEWORK }}-deploy-${{ matrix.profile }}-amd64-${{ github.run_id }}-${{ job.check_run_id }}
+          path: test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log
+          retention-days: 7
+
      - name: Cleanup
        if: always()
        timeout-minutes: 5
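The fault-tolerance step selects tests with -m 'k8s and fault_tolerance' plus a -k scenario filter, and now writes a JUnit report via --junitxml that later steps rename with the run and check-run IDs so parallel matrix uploads cannot collide. As a purely illustrative sketch (the real tests/fault_tolerance/deploy/test_deployment.py is not shown in this diff, and the test name below is hypothetical), a test picked up by that marker expression would look roughly like:

import pytest

@pytest.mark.k8s
@pytest.mark.fault_tolerance
def test_deployment_recovers_from_pod_restart():
    # A real test would use the --namespace/--image/--client-type options passed
    # on the command line to deploy the graph, inject a fault, and assert recovery.
    assert True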

components/src/dynamo/planner/utils/planner_core.py

Lines changed: 12 additions & 1 deletion
@@ -24,7 +24,7 @@
     PrefillInterpolator,
 )
 from dynamo.planner.utils.pre_swept_results_utils import PreSweptResultsHelper
-from dynamo.planner.utils.prometheus import PrometheusAPIClient
+from dynamo.planner.utils.prometheus import MetricSource, PrometheusAPIClient
 from dynamo.planner.utils.trace_data_extractor import extract_metrics_from_mooncake
 from dynamo.runtime import DistributedRuntime
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -150,9 +150,20 @@ def __init__(
         else:
             raise ValueError(f"Invalid environment: {args.environment}")

+        # Use backend metrics for vLLM (queries vllm:* metrics directly from workers)
+        # Use frontend metrics for other backends (queries dynamo_frontend_* metrics)
+        metric_source = (
+            MetricSource.VLLM
+            if args.backend.lower() == "vllm"
+            else MetricSource.FRONTEND
+        )
+        logger.info(
+            f"Initializing Prometheus client with metric_source='{metric_source}' for backend '{args.backend}'"
+        )
         self.prometheus_api_client = PrometheusAPIClient(
             args.metric_pulling_prometheus_endpoint,
             args.namespace,
+            metric_source=metric_source,
         )

         self.num_req_predictor = LOAD_PREDICTORS[args.load_predictor](
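MetricSource is imported from dynamo.planner.utils.prometheus, whose definition is not part of this diff. A hedged sketch of what that enum and the backend-based selection might look like — the member values are assumptions; only the vllm:* versus dynamo_frontend_* distinction comes from the comments above:

from enum import Enum

class MetricSource(str, Enum):
    VLLM = "vllm"          # query vllm:* series exported by the workers themselves
    FRONTEND = "frontend"  # query dynamo_frontend_* series exported by the frontend

def select_metric_source(backend: str) -> MetricSource:
    # Mirrors the selection logic added to planner_core.py above.
    return MetricSource.VLLM if backend.lower() == "vllm" else MetricSource.FRONTEND

assert select_metric_source("vLLM") is MetricSource.VLLM
assert select_metric_source("sglang") is MetricSource.FRONTEND

The chosen source is then passed to PrometheusAPIClient through the new metric_source keyword argument, so only vLLM deployments query worker-side metrics.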
