Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 58 additions & 1 deletion .github/actions/pytest/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,55 @@ runs:
# Run pytest with detailed output and JUnit XML
set +e # Don't exit on test failures

# DEBUG: Run diagnostic commands inside the container first.
# Starts a throwaway container (--rm) from the image under test and prints:
# kernel info, python/python3 locations and version, pytest location and
# version, PATH, the /workspace listing, and pytest-related pip packages.
echo "=========================================="
echo "🔍 DEBUG: Running container diagnostics"
echo "=========================================="

echo "📋 Checking container environment..."
# The whole probe script is passed to `bash -c` as one double-quoted string, so
# the calling shell expands it before the container sees it; \$PATH is escaped
# so that it is expanded by the shell inside the container instead.
# Every probe carries its own `|| echo` fallback, so the inner script ends on a
# successful echo. The trailing `|| echo` after the closing quote therefore
# runs only when `docker run` itself returns non-zero, and $? there is that
# status.
docker run --rm ${{ inputs.image_tag }} bash -c "
echo '--- Container Info ---'
uname -a
echo ''
echo '--- Python Location ---'
which python || echo 'python not found'
which python3 || echo 'python3 not found'
echo ''
echo '--- Python Version ---'
python --version 2>&1 || python3 --version 2>&1 || echo 'Python not available'
echo ''
echo '--- Pytest Location ---'
which pytest || echo 'pytest not found in PATH'
echo ''
echo '--- Pytest Version ---'
pytest --version 2>&1 || echo 'pytest command failed'
echo ''
echo '--- PATH ---'
echo \$PATH
echo ''
echo '--- /workspace contents ---'
ls -la /workspace 2>/dev/null || echo '/workspace does not exist or is empty'
echo ''
echo '--- pip list (pytest related) ---'
pip list 2>/dev/null | grep -i pytest || echo 'pip list failed or no pytest packages'
" || echo "DEBUG: Container diagnostic command failed with exit code: $?"

echo "=========================================="
echo "🔍 DEBUG: Diagnostics complete"
echo "=========================================="
Comment on lines +61 to +96
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Fix trailing whitespace on lines 93 and 97.

The pipeline failure points to trailing whitespace in this section: the two otherwise-blank lines that separate the banner `echo` statements from the diagnostic block contain spaces and need to be emptied.

         echo "=========================================="
         echo "🔍 DEBUG: Running container diagnostics"
         echo "=========================================="
-        
+
         echo "📋 Checking container environment..."
         ...
         " || echo "DEBUG: Container diagnostic command failed with exit code: $?"
-        
+
         echo "=========================================="
🤖 Prompt for AI Agents
In .github/actions/pytest/action.yml around lines 61 to 96, the two
whitespace-only lines (reported as lines 93 and 97) contain trailing spaces.
Remove that whitespace so the lines are completely empty, save the file, and
re-run the pipeline (or run a trailing-whitespace tidy) to confirm that no
other end-of-line whitespace remains.


# Determine docker runtime flags and pytest command based on dry_run mode
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
GPU_FLAGS=""
PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
# DEBUG: First try importing key test dependencies to catch import-time errors
# Then run pytest with extra verbosity to see collection details
PYTEST_CMD="echo '=== DEBUG: Testing key imports ===' && \
python -c 'import dynamo; print(\"dynamo OK\")' 2>&1 || echo 'dynamo import FAILED' && \
python -c 'import tensorrt_llm; print(\"tensorrt_llm OK\")' 2>&1 || echo 'tensorrt_llm import FAILED' && \
python -c 'import torch; print(\"torch OK\")' 2>&1 || echo 'torch import FAILED' && \
echo '=== DEBUG: Running pytest with extra verbosity ===' && \
pytest -vvv --collect-only --tb=short -m \"${{ inputs.pytest_marks }}\" 2>&1"
else
echo "🚀 Running pytest in normal mode"
PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
Expand All @@ -82,6 +126,10 @@ runs:
chmod 777 "${TEST_RESULTS_DIR}"
echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"

echo "🔧 DEBUG: About to run pytest with command: ${PYTEST_CMD}"
echo "🔧 DEBUG: GPU_FLAGS: ${GPU_FLAGS}"
echo "🔧 DEBUG: Image tag: ${{ inputs.image_tag }}"

docker run ${GPU_FLAGS} --rm -w /workspace \
--cpus=${NUM_CPUS} \
--network host \
Expand All @@ -93,6 +141,15 @@ runs:
TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"

# DEBUG: Additional exit code interpretation
#######################################
# Print a one-line hint for well-known exit statuses.
#   127 - command not found
#   139 - 128 + 11, i.e. terminated by SIGSEGV
#   137 - 128 + 9, i.e. terminated by SIGKILL. The OOM killer is one common
#         sender, but the status alone cannot prove it, so the hint is hedged.
# Arguments: $1 - exit status to describe
# Outputs:   one hint line on stdout; nothing for any other status
#######################################
describe_exit_code() {
  case "$1" in
    127)
      echo "❌ DEBUG: Exit code 127 = Command not found (pytest or bash missing from container)"
      ;;
    139)
      echo "❌ DEBUG: Exit code 139 = Segmentation fault (SIGSEGV) - likely a crash in pytest or imported modules"
      ;;
    137)
      echo "❌ DEBUG: Exit code 137 = Killed by SIGKILL (128+9) - commonly the OOM killer, but any SIGKILL produces this"
      ;;
  esac
}
describe_exit_code "${TEST_EXIT_CODE:-}"

# Verify test results were written (only in normal mode)
if [[ "${{ inputs.dry_run }}" != "true" ]]; then
Expand Down
44 changes: 29 additions & 15 deletions .github/workflows/nightly-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ name: Nightly CI pipeline
on:
schedule:
- cron: '0 8 * * *' # Every day at 08:00 UTC (12:00 AM PST / 1:00 AM PDT)
# TEMPORARY: Enable PR and manual triggers for debugging exit codes 127/139
pull_request:
branches:
- main
paths:
- '.github/workflows/nightly-ci.yml'
- '.github/actions/pytest/**'
workflow_dispatch: # Allow manual triggering from GitHub UI

permissions:
contents: read
Expand All @@ -28,7 +36,8 @@ jobs:
strategy:
fail-fast: false
matrix:
framework: [vllm, trtllm, sglang]
# TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
framework: [trtllm]
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
steps:
Expand Down Expand Up @@ -122,21 +131,22 @@ jobs:
fail-fast: false
matrix:
include:
- framework: vllm
base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
cuda_version: '129'
torch_backend: 'cu129'
# TEMPORARILY DISABLED - focusing on trtllm arm64 127 exit code debugging
# - framework: vllm
# base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
# runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
# cuda_version: '129'
# torch_backend: 'cu129'
- framework: trtllm
base_image_tag: '25.06-py3'
runtime_image_tag: ''
cuda_version: '129'
torch_backend: 'cu129'
- framework: sglang
base_image_tag: ''
runtime_image_tag: ''
cuda_version: ''
torch_backend: ''
# - framework: sglang
# base_image_tag: ''
# runtime_image_tag: ''
# cuda_version: ''
# torch_backend: ''
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
steps:
Expand Down Expand Up @@ -233,7 +243,8 @@ jobs:
strategy:
fail-fast: false
matrix:
framework: [vllm, trtllm, sglang]
# TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
framework: [trtllm]
arch:
- arch: amd64
runner: gpu-l40-amd64
Expand Down Expand Up @@ -304,7 +315,8 @@ jobs:
strategy:
fail-fast: false
matrix:
framework: [vllm, trtllm, sglang]
# TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
framework: [trtllm]
arch:
- arch: amd64
runner: gpu-l40-amd64
Expand Down Expand Up @@ -369,7 +381,8 @@ jobs:
strategy:
fail-fast: false
matrix:
framework: [vllm, trtllm, sglang]
# TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
framework: [trtllm]
arch:
- arch: amd64
runner: gpu-l40-amd64
Expand Down Expand Up @@ -435,7 +448,8 @@ jobs:
strategy:
fail-fast: false
matrix:
framework: [vllm, trtllm, sglang]
# TEMPORARILY REDUCED - focusing on trtllm exit code debugging (127 arm64, 139 amd64)
framework: [trtllm]
arch:
- arch: amd64
runner: gpu-l40-amd64
Expand Down
Loading