diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml
index 5cc89b4bc5..a29be4167f 100644
--- a/.github/actions/pytest/action.yml
+++ b/.github/actions/pytest/action.yml
@@ -58,11 +58,55 @@ runs:
         # Run pytest with detailed output and JUnit XML
         set +e # Don't exit on test failures
 
+        # DEBUG: Run diagnostic commands inside the container first
+        echo "=========================================="
+        echo "🔍 DEBUG: Running container diagnostics"
+        echo "=========================================="
+
+        echo "📋 Checking container environment..."
+        docker run --rm "${{ inputs.image_tag }}" bash -c "
+          echo '--- Container Info ---'
+          uname -a
+          echo ''
+          echo '--- Python Location ---'
+          which python || echo 'python not found'
+          which python3 || echo 'python3 not found'
+          echo ''
+          echo '--- Python Version ---'
+          python --version 2>&1 || python3 --version 2>&1 || echo 'Python not available'
+          echo ''
+          echo '--- Pytest Location ---'
+          which pytest || echo 'pytest not found in PATH'
+          echo ''
+          echo '--- Pytest Version ---'
+          pytest --version 2>&1 || echo 'pytest command failed'
+          echo ''
+          echo '--- PATH ---'
+          echo \$PATH
+          echo ''
+          echo '--- /workspace contents ---'
+          ls -la /workspace 2>/dev/null || echo '/workspace does not exist or is empty'
+          echo ''
+          echo '--- pip list (pytest related) ---'
+          pip list 2>/dev/null | grep -i pytest || echo 'pip list failed or no pytest packages'
+        " || echo "DEBUG: Container diagnostic command failed with exit code: $?"
+
+        echo "=========================================="
+        echo "🔍 DEBUG: Diagnostics complete"
+        echo "=========================================="
+
         # Determine docker runtime flags and pytest command based on dry_run mode
         if [[ "${{ inputs.dry_run }}" == "true" ]]; then
           echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
           GPU_FLAGS=""
-          PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
+          # DEBUG: First try importing key test dependencies to catch import-time errors
+          # Then run pytest with extra verbosity to see collection details
+          PYTEST_CMD="echo '=== DEBUG: Testing key imports ===' && \
+            python -c 'import dynamo; print(\"dynamo OK\")' 2>&1 || echo 'dynamo import FAILED' && \
+            python -c 'import tensorrt_llm; print(\"tensorrt_llm OK\")' 2>&1 || echo 'tensorrt_llm import FAILED' && \
+            python -c 'import torch; print(\"torch OK\")' 2>&1 || echo 'torch import FAILED' && \
+            echo '=== DEBUG: Running pytest with extra verbosity ===' && \
+            pytest -vvv --collect-only --tb=short -m \"${{ inputs.pytest_marks }}\" 2>&1"
         else
           echo "🚀 Running pytest in normal mode"
           PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
@@ -82,6 +126,10 @@ runs:
         chmod 777 "${TEST_RESULTS_DIR}"
         echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
 
+        echo "🔧 DEBUG: About to run pytest with command: ${PYTEST_CMD}"
+        echo "🔧 DEBUG: GPU_FLAGS: ${GPU_FLAGS}"
+        echo "🔧 DEBUG: Image tag: ${{ inputs.image_tag }}"
+
         docker run ${GPU_FLAGS} --rm -w /workspace \
           --cpus=${NUM_CPUS} \
           --network host \
@@ -93,6 +141,15 @@ runs:
         TEST_EXIT_CODE=$?
         echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
         echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"
+
+        # DEBUG: Additional exit code interpretation
+        if [[ ${TEST_EXIT_CODE} -eq 127 ]]; then
+          echo "❌ DEBUG: Exit code 127 = Command not found (pytest or bash missing from container)"
+        elif [[ ${TEST_EXIT_CODE} -eq 139 ]]; then
+          echo "❌ DEBUG: Exit code 139 = Segmentation fault (SIGSEGV) - likely a crash in pytest or imported modules"
+        elif [[ ${TEST_EXIT_CODE} -eq 137 ]]; then
+          echo "❌ DEBUG: Exit code 137 = Killed by SIGKILL (128+9) - often the OOM killer, but also docker kill or a timeout"
+        fi
 
         # Verify test results were written (only in normal mode)
         if [[ "${{ inputs.dry_run }}" != "true" ]]; then
diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml
index bee2426d1d..295f2da3a8 100644
--- a/.github/workflows/nightly-ci.yml
+++ b/.github/workflows/nightly-ci.yml
@@ -6,6 +6,14 @@ name: Nightly CI pipeline
 on:
   schedule:
     - cron: '0 8 * * *' # Every day at 12:00 AM PST (08:00 UTC)
+  # TEMPORARY: Enable PR and manual triggers for debugging exit codes 127/139
+  pull_request:
+    branches:
+      - main
+    paths:
+      - '.github/workflows/nightly-ci.yml'
+      - '.github/actions/pytest/**'
+  workflow_dispatch:  # Allow manual triggering from GitHub UI
 
 permissions:
   contents: read
@@ -28,7 +36,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
     env:
       ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
     steps:
@@ -122,21 +131,22 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - framework: vllm
-            base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
-            runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
-            cuda_version: '129'
-            torch_backend: 'cu129'
+          # TEMPORARILY DISABLED - focusing on trtllm arm64 127 exit code debugging
+          # - framework: vllm
+          #   base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
+          #   runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
+          #   cuda_version: '129'
+          #   torch_backend: 'cu129'
           - framework: trtllm
             base_image_tag: '25.06-py3'
             runtime_image_tag: ''
             cuda_version: '129'
             torch_backend: 'cu129'
-          - framework: sglang
-            base_image_tag: ''
-            runtime_image_tag: ''
-            cuda_version: ''
-            torch_backend: ''
+          # - framework: sglang
+          #   base_image_tag: ''
+          #   runtime_image_tag: ''
+          #   cuda_version: ''
+          #   torch_backend: ''
     env:
       ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
     steps:
@@ -233,7 +243,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64
@@ -304,7 +315,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64
@@ -369,7 +381,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64
@@ -435,7 +448,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        framework: [vllm, trtllm, sglang]
+        # TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
+        framework: [trtllm]
         arch:
           - arch: amd64
             runner: gpu-l40-amd64