ai-dynamo · nv-nmailhot · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025 · coderabbitai
@@ -58,11 +58,55 @@ runs:
         # Run pytest with detailed output and JUnit XML
         set +e  # Don't exit on test failures
 
+        # DEBUG: Probe the image in a throwaway (--rm) container before the real test run; informational only
+        echo "=========================================="
+        echo "🔍 DEBUG: Running container diagnostics"
+        echo "=========================================="
+        # Inner script notes: \$PATH is escaped so it expands inside the container; 'command -v' is a shell builtin, so it works even if 'which' is absent
+        echo "📋 Checking container environment..."
+        docker run --rm "${{ inputs.image_tag }}" bash -c "
+          echo '--- Container Info ---'
+          uname -a
+          echo ''
+          echo '--- Python Location ---'
+          command -v python || echo 'python not found'
+          command -v python3 || echo 'python3 not found'
+          echo ''
+          echo '--- Python Version ---'
+          python --version 2>&1 || python3 --version 2>&1 || echo 'Python not available'
+          echo ''
+          echo '--- Pytest Location ---'
+          command -v pytest || echo 'pytest not found in PATH'
+          echo ''
+          echo '--- Pytest Version ---'
+          pytest --version 2>&1 || echo 'pytest command failed'
+          echo ''
+          echo '--- PATH ---'
+          echo \$PATH
+          echo ''
+          echo '--- /workspace contents ---'
+          ls -la /workspace 2>/dev/null || echo '/workspace does not exist or is not readable'
+          echo ''
+          echo '--- pip list (pytest related) ---'
+          pip list 2>/dev/null | grep -i pytest || echo 'pip list failed or no pytest packages'
+        " || echo "DEBUG: Container diagnostic command failed with exit code: $?"
+
+        echo "=========================================="
+        echo "🔍 DEBUG: Diagnostics complete"
+        echo "=========================================="
+
         # Determine docker runtime flags and pytest command based on dry_run mode
         if [[ "${{ inputs.dry_run }}" == "true" ]]; then
           echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
           GPU_FLAGS=""
-          PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
+          # DEBUG: Import each key test dependency first so import-time failures are reported per module, then collect with extra verbosity.
+          # Each probe is brace-grouped so its '|| echo' fallback only covers that probe and cannot mask a failure elsewhere in the chain.
+          PYTEST_CMD="echo '=== DEBUG: Testing key imports ===' && \
+            { python -c 'import dynamo; print(\"dynamo OK\")' 2>&1 || echo 'dynamo import FAILED'; } && \
+            { python -c 'import tensorrt_llm; print(\"tensorrt_llm OK\")' 2>&1 || echo 'tensorrt_llm import FAILED'; } && \
+            { python -c 'import torch; print(\"torch OK\")' 2>&1 || echo 'torch import FAILED'; } && \
+            echo '=== DEBUG: Running pytest with extra verbosity ===' && \
+            pytest -vvv --collect-only --tb=short -m \"${{ inputs.pytest_marks }}\" 2>&1"
         else
           echo "🚀 Running pytest in normal mode"
           PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
@@ -82,6 +126,10 @@ runs:
         chmod 777 "${TEST_RESULTS_DIR}"
         echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
 
+        printf '%s\n' "🔧 DEBUG: About to run pytest with command: ${PYTEST_CMD}" \
+          "🔧 DEBUG: GPU_FLAGS: ${GPU_FLAGS}" \
+          "🔧 DEBUG: Image tag: ${{ inputs.image_tag }}"
+
         docker run ${GPU_FLAGS} --rm -w /workspace \
           --cpus=${NUM_CPUS} \
           --network host \
@@ -93,6 +141,15 @@ runs:
         TEST_EXIT_CODE=$?
         echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
         echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"
+
+        # DEBUG: Translate well-known exit statuses (docker run / shell conventions, 128+N = killed by signal N) into a hint
+        case "${TEST_EXIT_CODE}" in
+          125) echo "❌ DEBUG: Exit code 125 = Usually docker run itself failed (daemon error, bad flag or image) - tests never started" ;;
+          126) echo "❌ DEBUG: Exit code 126 = Command found but could not be invoked (not executable / permission denied)" ;;
+          127) echo "❌ DEBUG: Exit code 127 = Command not found (pytest or bash missing from container)" ;;
+          137) echo "❌ DEBUG: Exit code 137 = Killed by SIGKILL - often the OOM killer, but also docker kill/stop or a timeout" ;;
+          139) echo "❌ DEBUG: Exit code 139 = Segmentation fault (SIGSEGV) - likely a crash in pytest or imported modules" ;;
+        esac
 
         # Verify test results were written (only in normal mode)
         if [[ "${{ inputs.dry_run }}" != "true" ]]; then

@@ -6,6 +6,14 @@ name: Nightly CI pipeline
 on:
   schedule:
     - cron: '0 8 * * *' # Every day at 12:00 AM PST (08:00 UTC)
+  # TEMPORARY: Enable PR and manual triggers (in addition to the schedule) for debugging exit codes 127/139
+  pull_request:
+    branches:  # Only PRs targeting main
+      - main
+    paths:  # Only PRs that touch the nightly workflow file or the pytest action
+      - '.github/workflows/nightly-ci.yml'
+      - '.github/actions/pytest/**'
+  workflow_dispatch:  # Allow manual triggering from GitHub UI
 
 permissions:
   contents: read
@@ -28,7 +36,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
     env:
       ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
     steps:
@@ -122,21 +131,22 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - framework: vllm
-            base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
-            runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
-            cuda_version: '129'
-            torch_backend: 'cu129'
+          # TEMPORARILY DISABLED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+          # - framework: vllm
+          #   base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
+          #   runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
+          #   cuda_version: '129'
+          #   torch_backend: 'cu129'
           - framework: trtllm
             base_image_tag: '25.06-py3'
             runtime_image_tag: ''
             cuda_version: '129'
             torch_backend: 'cu129'
-          - framework: sglang
-            base_image_tag: ''
-            runtime_image_tag: ''
-            cuda_version: ''
-            torch_backend: ''
+          # - framework: sglang
+          #   base_image_tag: ''
+          #   runtime_image_tag: ''
+          #   cuda_version: ''
+          #   torch_backend: ''
     env:
       ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
     steps:
@@ -233,7 +243,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64
@@ -304,7 +315,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64
@@ -369,7 +381,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64
@@ -435,7 +448,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64