ai-dynamo
diff --git a/‎.github/actions/docker-build/action.yml‎
Lines changed: 36 additions & 9 deletions b/‎.github/actions/docker-build/action.yml‎
Lines changed: 36 additions & 9 deletions
diff --git a/‎.github/actions/docker-login/action.yml‎
Lines changed: 46 additions & 0 deletions b/‎.github/actions/docker-login/action.yml‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎.github/actions/docker-tag-push/action.yml‎
Lines changed: 28 additions & 16 deletions b/‎.github/actions/docker-tag-push/action.yml‎
Lines changed: 28 additions & 16 deletions
diff --git a/‎.github/actions/pytest/action.yml‎
Lines changed: 38 additions & 31 deletions b/‎.github/actions/pytest/action.yml‎
Lines changed: 38 additions & 31 deletions
@@ -49,6 +49,12 @@ inputs:
   torch_backend:
     description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
     required: false
+  enable_kvbm:
+    description: 'Enable KVBM support (optional)'
+    required: false
+  dynamo_base_image:
+    description: 'Pre-built Dynamo base image to use instead of building from scratch'
+    required: false
 
 outputs:
   image_tag:
@@ -72,14 +78,9 @@ runs:
         aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
     - name: Login to NGC
       if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
-      shell: bash
-      run: |
-        echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
-    - name: Cleanup
-      if: always()
-      shell: bash
-      run: |
-        docker system prune -af
+      # The condition above limits this step to push events and to pull
+      # requests whose head repository is this repository.
+      # NOTE(review): presumably because pull requests from forks cannot
+      # read the NGC token - confirm.
+      # The login itself is delegated to the shared docker-login action.
+      # Only the NGC token is forwarded, so that action's ECR and ACR steps
+      # (which require their own inputs to be non-empty) are skipped.
+      uses: ./.github/actions/docker-login
+      with:
+        ngc_ci_access_token: ${{ inputs.ngc_ci_access_token }}
     - name: Build image
       id: build
       shell: bash
@@ -125,6 +126,12 @@ runs:
         if [ -n "${{ inputs.torch_backend }}" ]; then
           EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}"
         fi
+        if [ -n "${{ inputs.dynamo_base_image }}" ]; then
+          EXTRA_ARGS+=" --dynamo-base-image ${{ inputs.dynamo_base_image }}"
+        fi
+        if [ -n "${{ inputs.enable_kvbm }}" ]; then
+          EXTRA_ARGS+=" --build-arg ENABLE_KVBM=${{ inputs.enable_kvbm }}"
+        fi
 
         # Execute build and capture output (show on console AND save to file)
         ./container/build.sh --tag "$IMAGE_TAG" \
@@ -144,6 +151,26 @@ runs:
         # Exit with the build's exit code
         exit ${BUILD_EXIT_CODE}
 
+    - name: Run Sanity Check on Runtime Image
+      if: inputs.target == 'runtime'
+      shell: bash
+      run: |
+        # Image under test: the tag emitted by this action's `build` step.
+        image_under_test="${{ steps.build.outputs.image_tag }}"
+        echo "Running sanity check on image: $image_under_test"
+
+        # Execute the sanity check script in a throwaway container.
+        # The script is located in /workspace/deploy/sanity_check.py in runtime containers.
+        # Using the command as the `if` condition keeps errexit from aborting
+        # the step before the failure message below can be printed.
+        if docker run --rm "$image_under_test" python /workspace/deploy/sanity_check.py --runtime-check --no-gpu-check; then
+          echo "✅ Sanity check passed"
+        else
+          # In the else branch, $? still holds the container's exit status.
+          sanity_rc=$?
+          echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
+          exit ${sanity_rc}
+        fi
+
     - name: Capture Build Metrics
       id: metrics
       shell: bash
@@ -289,7 +316,7 @@ runs:
       uses: actions/upload-artifact@v4
       if: always()
       with:
-        name: build-metrics-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
+        name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
         path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
         retention-days: 7
 
@@ -0,0 +1,46 @@
+name: 'Docker Login'
+description: 'Login to multiple container registries (ECR, NGC, ACR)'
+
+# Composite action that logs the Docker CLI in to up to three registries.
+# Every input is optional: each login step runs only when all of the inputs
+# it needs are non-empty, so callers pass just the credentials they have.
+#
+# Inputs are handed to the shell through `env:` and referenced as quoted
+# shell variables. They are never expanded inline with `${{ }}` inside a
+# `run:` script, because inline expansion pastes the value into the script
+# text before bash parses it (script-injection risk, and breaks on quotes).
+inputs:
+  ngc_ci_access_token:
+    description: 'NGC CI Access Token'
+    required: false
+  aws_default_region:
+    description: 'AWS Default Region'
+    required: false
+  aws_account_id:
+    description: 'AWS Account ID'
+    required: false
+  azure_acr_hostname:
+    description: 'Azure ACR hostname'
+    required: false
+  azure_acr_user:
+    description: 'Azure ACR user'
+    required: false
+  azure_acr_password:
+    description: 'Azure ACR password'
+    required: false
+
+runs:
+  using: "composite"
+  steps:
+    # Runs only when both the AWS region and the AWS account id are supplied.
+    - name: ECR Login
+      shell: bash
+      if: ${{ inputs.aws_default_region != '' && inputs.aws_account_id != '' }}
+      env:
+        ECR_REGION: ${{ inputs.aws_default_region }}
+        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
+      run: |
+        set -euo pipefail
+        aws ecr get-login-password --region "${ECR_REGION}" | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
+    # Runs only when an NGC token is supplied.
+    - name: NGC Login
+      if: ${{ inputs.ngc_ci_access_token != '' }}
+      shell: bash
+      env:
+        NGC_CI_ACCESS_TOKEN: ${{ inputs.ngc_ci_access_token }}
+      run: |
+        set -euo pipefail
+        # Single quotes keep $oauthtoken literal; it is passed as the user name.
+        echo "${NGC_CI_ACCESS_TOKEN}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
+    # Runs only when the ACR hostname, user and password are all supplied.
+    - name: ACR Login
+      shell: bash
+      if: ${{ inputs.azure_acr_hostname != '' && inputs.azure_acr_user != '' && inputs.azure_acr_password != '' }}
+      env:
+        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
+        AZURE_ACR_USER: ${{ inputs.azure_acr_user }}
+        AZURE_ACR_PASSWORD: ${{ inputs.azure_acr_password }}
+      run: |
+        set -euo pipefail
+        echo "${AZURE_ACR_PASSWORD}" | docker login "${AZURE_ACR_HOSTNAME}" --username "${AZURE_ACR_USER}" --password-stdin
@@ -1,11 +1,12 @@
+name: 'Docker Tag and Push'
 description: 'Tag and Push Docker Images'
 
 inputs:
   local_image:
     description: 'Local Image Name:Tag'
     required: true
-  push_tag:
-    description: 'Target Name:Tag'
+  push_tags:
+    description: 'Target Name:Tag (newline-separated list for multiple tags)'
     required: true
   aws_push:
     description: 'Push to AWS Boolean'
@@ -38,37 +39,48 @@ inputs:
     required: false
 
 outputs:
-  image_tag:
-    description: 'Image Tag'
-    value: ${{ inputs.push_tag }}
+  # Echoes the push_tags input unchanged: the same newline-separated list the
+  # caller supplied, without any registry hostname prepended.
+  image_tags:
+    description: 'Image Tags'
+    value: ${{ inputs.push_tags }}
 
 runs:
   using: "composite"
   steps:
     - name: Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
-    - name: ACR Login
-      shell: bash
-      if: ${{ inputs.azure_push == 'true' }}
-      run: |
-        echo "${{ inputs.azure_acr_password }}" | docker login ${{ inputs.azure_acr_hostname }} --username ${{ inputs.azure_acr_user }} --password-stdin
+
     - name: ECR Tag and Push
       shell: bash
       if: ${{ inputs.aws_push == 'true' }}
       env:
         LOCAL_IMAGE: ${{ inputs.local_image }}
-        PUSH_TAG: ${{ inputs.push_tag }}
+        PUSH_TAGS: ${{ inputs.push_tags }}
         ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
       run: |
-        docker tag ${LOCAL_IMAGE} ${ECR_HOSTNAME}/${PUSH_TAG}
-        docker push ${ECR_HOSTNAME}/${PUSH_TAG}
+        set -euo pipefail
+        # PUSH_TAGS holds one Name:Tag per line. Each non-empty line is
+        # tagged under the ECR hostname and pushed; a failing tag or push
+        # aborts the step because of `set -e`.
+        while IFS= read -r tag; do
+          # Skip blank lines (the here-string always ends with one newline).
+          [ -n "$tag" ] || continue
+          remote_ref="${ECR_HOSTNAME}/${tag}"
+          echo "Tagging and pushing: ${remote_ref}"
+          docker tag "${LOCAL_IMAGE}" "${remote_ref}"
+          docker push "${remote_ref}"
+        done <<< "$PUSH_TAGS"
+
     - name: ACR Tag and Push
       shell: bash
       if: ${{ inputs.azure_push == 'true' }}
       env:
         LOCAL_IMAGE: ${{ inputs.local_image }}
-        PUSH_TAG: ${{ inputs.push_tag }}
+        PUSH_TAGS: ${{ inputs.push_tags }}
         AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
       run: |
-        docker tag ${LOCAL_IMAGE} ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
-        docker push ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
+        set -euo pipefail
+        # PUSH_TAGS holds one Name:Tag per line. Each non-empty line is
+        # tagged under the ACR hostname and pushed; a failing tag or push
+        # aborts the step because of `set -e`.
+        while IFS= read -r tag; do
+          # Skip blank lines (the here-string always ends with one newline).
+          [ -n "$tag" ] || continue
+          remote_ref="${AZURE_ACR_HOSTNAME}/${tag}"
+          echo "Tagging and pushing: ${remote_ref}"
+          docker tag "${LOCAL_IMAGE}" "${remote_ref}"
+          docker push "${remote_ref}"
+        done <<< "$PUSH_TAGS"
@@ -24,6 +24,10 @@ inputs:
     description: 'Platform architecture (amd64, arm64)'
     required: false
     default: 'amd64'
+  dry_run:
+    description: 'Run pytest in dry-run mode (collect tests only, do not execute)'
+    required: false
+    default: 'false'
 
 
 runs:
@@ -54,31 +58,50 @@ runs:
         # Run pytest with detailed output and JUnit XML
         set +e  # Don't exit on test failures
 
-        # Detect GPU availability and conditionally add GPU flags
-        GPU_FLAGS=""
-        if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
-          echo "GPU detected, enabling GPU runtime"
-          GPU_FLAGS="--runtime=nvidia --gpus all"
+        # Determine docker runtime flags and pytest command based on dry_run mode
+        if [[ "${{ inputs.dry_run }}" == "true" ]]; then
+          echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
+          GPU_FLAGS=""
+          PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
         else
-          echo "No GPU detected, running in CPU-only mode"
+          echo "🚀 Running pytest in normal mode"
+          PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
+
+          # Detect GPU availability and conditionally add GPU flags
+          GPU_FLAGS=""
+          if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
+            echo "✓ GPU detected, enabling GPU runtime"
+            GPU_FLAGS="--runtime=nvidia --gpus all"
+          else
+            echo "⚠️  No GPU detected, running in CPU-only mode"
+          fi
         fi
 
+        # Get absolute path for test-results directory and ensure it has proper permissions
+        TEST_RESULTS_DIR="$(pwd)/test-results"
+        chmod 777 "${TEST_RESULTS_DIR}"
+        echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
+
         docker run ${GPU_FLAGS} --rm -w /workspace \
           --cpus=${NUM_CPUS} \
           --network host \
           --name ${{ env.CONTAINER_ID }}_pytest \
+          -v "${TEST_RESULTS_DIR}:/workspace/test-results" \
           ${{ inputs.image_tag }} \
-          bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
+          bash -c "${PYTEST_CMD}"
 
         TEST_EXIT_CODE=$?
         echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
         echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"
 
-        # Copy test results from container to host
-        docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/test-results . || echo "Failed to copy test results"
-
-        # Clean up container
-        docker rm -f ${{ env.CONTAINER_ID }}_pytest || echo "Failed to clean up container"
+        # Verify test results were written (only in normal mode)
+        if [[ "${{ inputs.dry_run }}" != "true" ]]; then
+          if [[ -f "${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}" ]]; then
+            echo "✅ Test results file found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
+          else
+            echo "⚠️  Test results file not found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
+          fi
+        fi
 
         # Always continue to results processing
         exit 0
@@ -103,23 +126,9 @@ runs:
           ERROR_TESTS=$(grep -o 'errors="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
           echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)"
 
-          # Create uniquely named metadata file with step context information
-          # Use framework-testtype-arch to make it unique per test run
-          METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.json"
-          JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.xml"
-
           # Rename XML file to unique name
+          JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml"
           mv "$JUNIT_FILE" "test-results/$JUNIT_NAME"
-
-          echo '{' > "$METADATA_FILE"
-          echo '  "job_name": "${{ github.job }}",' >> "$METADATA_FILE"
-          echo '  "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE"
-          echo '  "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE"
-          echo '  "platform_arch": "${{ inputs.platform_arch }}",' >> "$METADATA_FILE"
-          echo '  "junit_xml_file": "'"$JUNIT_NAME"'",' >> "$METADATA_FILE"
-          echo '  "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE"
-          echo '}' >> "$METADATA_FILE"
-          echo "📝 Created test metadata file: $METADATA_FILE"
           echo "📝 Renamed XML file to: $JUNIT_NAME"
         else
           echo "⚠️  JUnit XML file not found - test results may not be available for upload"
@@ -135,8 +144,6 @@ runs:
       uses: actions/upload-artifact@v4
       if: always()  # Always upload test results, even if tests failed
       with:
-        name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
-        path: |
-          test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
-          test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
+        name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
+        path: test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
         retention-days: 7