Skip to content

Commit c18774b

Browse files
pvijayakrish authored and furionw committed
ci: Adding nightly pipeline workflow (#4204)
Signed-off-by: pvijayakrish <[email protected]>
Signed-off-by: Pavithra Vijayakrishnan <[email protected]>
1 parent e187d94 commit c18774b

File tree

12 files changed

+939
-154
lines changed

12 files changed

+939
-154
lines changed

.github/actions/docker-build/action.yml

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,12 @@ inputs:
4949
torch_backend:
5050
description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
5151
required: false
52+
enable_kvbm:
53+
description: 'Enable KVBM support (optional)'
54+
required: false
55+
dynamo_base_image:
56+
description: 'Pre-built Dynamo base image to use instead of building from scratch'
57+
required: false
5258

5359
outputs:
5460
image_tag:
@@ -72,14 +78,9 @@ runs:
7278
aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
7379
- name: Login to NGC
7480
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
75-
shell: bash
76-
run: |
77-
echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
78-
- name: Cleanup
79-
if: always()
80-
shell: bash
81-
run: |
82-
docker system prune -af
81+
uses: ./.github/actions/docker-login
82+
with:
83+
ngc_ci_access_token: ${{ inputs.ngc_ci_access_token }}
8384
- name: Build image
8485
id: build
8586
shell: bash
@@ -125,6 +126,12 @@ runs:
125126
if [ -n "${{ inputs.torch_backend }}" ]; then
126127
EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}"
127128
fi
129+
if [ -n "${{ inputs.dynamo_base_image }}" ]; then
130+
EXTRA_ARGS+=" --dynamo-base-image ${{ inputs.dynamo_base_image }}"
131+
fi
132+
if [ -n "${{ inputs.enable_kvbm }}" ]; then
133+
EXTRA_ARGS+=" --build-arg ENABLE_KVBM=${{ inputs.enable_kvbm }}"
134+
fi
128135
129136
# Execute build and capture output (show on console AND save to file)
130137
./container/build.sh --tag "$IMAGE_TAG" \
@@ -289,7 +296,7 @@ runs:
289296
uses: actions/upload-artifact@v4
290297
if: always()
291298
with:
292-
name: build-metrics-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
299+
name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
293300
path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
294301
retention-days: 7
295302

.github/actions/docker-login/action.yml

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,46 @@
1+
name: 'Docker Login'
2+
description: 'Login to multiple container registries (ECR, NGC, ACR)'
3+
4+
inputs:
5+
ngc_ci_access_token:
6+
description: 'NGC CI Access Token'
7+
required: false
8+
aws_default_region:
9+
description: 'AWS Default Region'
10+
required: false
11+
aws_account_id:
12+
description: 'AWS Account ID'
13+
required: false
14+
azure_acr_hostname:
15+
description: 'Azure ACR hostname'
16+
required: false
17+
azure_acr_user:
18+
description: 'Azure ACR user'
19+
required: false
20+
azure_acr_password:
21+
description: 'Azure ACR password'
22+
required: false
23+
24+
runs:
25+
using: "composite"
26+
steps:
27+
- name: ECR Login
28+
shell: bash
29+
if: ${{ inputs.aws_default_region != '' && inputs.aws_account_id != '' }}
30+
env:
31+
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
32+
run: |
33+
set -euo pipefail
34+
aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
35+
- name: NGC Login
36+
if: ${{ inputs.ngc_ci_access_token != '' }}
37+
shell: bash
38+
run: |
39+
set -euo pipefail
40+
echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
41+
- name: ACR Login
42+
shell: bash
43+
if: ${{ inputs.azure_acr_hostname != '' && inputs.azure_acr_user != '' && inputs.azure_acr_password != '' }}
44+
run: |
45+
set -euo pipefail
46+
echo "${{ inputs.azure_acr_password }}" | docker login "${{ inputs.azure_acr_hostname }}" --username "${{ inputs.azure_acr_user }}" --password-stdin

.github/actions/docker-tag-push/action.yml

Lines changed: 28 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
1+
name: 'Docker Tag and Push'
12
description: 'Tag and Push Docker Images'
23

34
inputs:
45
local_image:
56
description: 'Local Image Name:Tag'
67
required: true
7-
push_tag:
8-
description: 'Target Name:Tag'
8+
push_tags:
9+
description: 'Target Name:Tag (newline-separated list for multiple tags)'
910
required: true
1011
aws_push:
1112
description: 'Push to AWS Boolean'
@@ -38,37 +39,48 @@ inputs:
3839
required: false
3940

4041
outputs:
41-
image_tag:
42-
description: 'Image Tag'
43-
value: ${{ inputs.push_tag }}
42+
image_tags:
43+
description: 'Image Tags'
44+
value: ${{ inputs.push_tags }}
4445

4546
runs:
4647
using: "composite"
4748
steps:
4849
- name: Set up Docker Buildx
4950
uses: docker/setup-buildx-action@v3
50-
- name: ACR Login
51-
shell: bash
52-
if: ${{ inputs.azure_push == 'true' }}
53-
run: |
54-
echo "${{ inputs.azure_acr_password }}" | docker login ${{ inputs.azure_acr_hostname }} --username ${{ inputs.azure_acr_user }} --password-stdin
51+
5552
- name: ECR Tag and Push
5653
shell: bash
5754
if: ${{ inputs.aws_push == 'true' }}
5855
env:
5956
LOCAL_IMAGE: ${{ inputs.local_image }}
60-
PUSH_TAG: ${{ inputs.push_tag }}
57+
PUSH_TAGS: ${{ inputs.push_tags }}
6158
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
6259
run: |
63-
docker tag ${LOCAL_IMAGE} ${ECR_HOSTNAME}/${PUSH_TAG}
64-
docker push ${ECR_HOSTNAME}/${PUSH_TAG}
60+
set -euo pipefail
61+
while IFS= read -r TAG; do
62+
if [ -z "$TAG" ]; then
63+
continue
64+
fi
65+
echo "Tagging and pushing: ${ECR_HOSTNAME}/${TAG}"
66+
docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${TAG}"
67+
docker push "${ECR_HOSTNAME}/${TAG}"
68+
done <<< "$PUSH_TAGS"
69+
6570
- name: ACR Tag and Push
6671
shell: bash
6772
if: ${{ inputs.azure_push == 'true' }}
6873
env:
6974
LOCAL_IMAGE: ${{ inputs.local_image }}
70-
PUSH_TAG: ${{ inputs.push_tag }}
75+
PUSH_TAGS: ${{ inputs.push_tags }}
7176
AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
7277
run: |
73-
docker tag ${LOCAL_IMAGE} ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
74-
docker push ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
78+
set -euo pipefail
79+
while IFS= read -r TAG; do
80+
if [ -z "$TAG" ]; then
81+
continue
82+
fi
83+
echo "Tagging and pushing: ${AZURE_ACR_HOSTNAME}/${TAG}"
84+
docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${TAG}"
85+
docker push "${AZURE_ACR_HOSTNAME}/${TAG}"
86+
done <<< "$PUSH_TAGS"

.github/actions/pytest/action.yml

Lines changed: 31 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,10 @@ inputs:
2424
description: 'Platform architecture (amd64, arm64)'
2525
required: false
2626
default: 'amd64'
27+
dry_run:
28+
description: 'Run pytest in dry-run mode (collect tests only, do not execute)'
29+
required: false
30+
default: 'false'
2731

2832

2933
runs:
@@ -54,21 +58,32 @@ runs:
5458
# Run pytest with detailed output and JUnit XML
5559
set +e # Don't exit on test failures
5660
57-
# Detect GPU availability and conditionally add GPU flags
58-
GPU_FLAGS=""
59-
if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
60-
echo "GPU detected, enabling GPU runtime"
61-
GPU_FLAGS="--runtime=nvidia --gpus all"
61+
# Determine docker runtime flags and pytest command based on dry_run mode
62+
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
63+
echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
64+
GPU_FLAGS=""
65+
PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
6266
else
63-
echo "No GPU detected, running in CPU-only mode"
67+
echo "🚀 Running pytest in normal mode"
68+
PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
69+
70+
# Detect GPU availability and conditionally add GPU flags
71+
GPU_FLAGS=""
72+
if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
73+
echo "✓ GPU detected, enabling GPU runtime"
74+
GPU_FLAGS="--runtime=nvidia --gpus all"
75+
else
76+
echo "⚠️ No GPU detected, running in CPU-only mode"
77+
fi
6478
fi
6579
66-
docker run ${GPU_FLAGS} --rm -w /workspace \
80+
# Run without --rm so we can copy results even if container crashes (example SIGSEGV exit 139)
81+
docker run ${GPU_FLAGS} -w /workspace \
6782
--cpus=${NUM_CPUS} \
6883
--network host \
6984
--name ${{ env.CONTAINER_ID }}_pytest \
7085
${{ inputs.image_tag }} \
71-
bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
86+
bash -c "mkdir -p /workspace/test-results && ${PYTEST_CMD}"
7287
7388
TEST_EXIT_CODE=$?
7489
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
@@ -92,6 +107,13 @@ runs:
92107
STR_TEST_TYPE=$(echo "${{ inputs.test_type }}" | tr ', ' '_')
93108
echo "STR_TEST_TYPE=${STR_TEST_TYPE}" >> $GITHUB_ENV
94109
110+
# Skip XML processing if in dry-run mode
111+
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
112+
echo "✅ Dry-run mode: Test collection completed"
113+
echo "⏭️ No JUnit XML generated (dry-run mode)"
114+
exit 0
115+
fi
116+
95117
# Check for JUnit XML file and determine test status
96118
JUNIT_FILE="test-results/pytest_test_report.xml"
97119
@@ -133,7 +155,7 @@ runs:
133155
134156
- name: Upload Test Results
135157
uses: actions/upload-artifact@v4
136-
if: always() # Always upload test results, even if tests failed
158+
if: always() && inputs.dry_run != 'true' # Skip upload in dry-run mode
137159
with:
138160
name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
139161
path: |

.github/workflows/container-validation-backends.yml

Lines changed: 29 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -72,11 +72,10 @@ jobs:
7272
with:
7373
driver: docker
7474
- name: Login to ECR
75-
shell: bash
76-
env:
77-
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
78-
run: |
79-
aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
75+
uses: ./.github/actions/docker-login
76+
with:
77+
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
78+
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
8079
- name: Linter
8180
shell: bash
8281
env:
@@ -120,7 +119,7 @@ jobs:
120119
uses: ./.github/actions/docker-tag-push
121120
with:
122121
local_image: dynamo-operator:latest
123-
push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
122+
push_tags: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
124123
aws_push: 'false'
125124
azure_push: 'true'
126125
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
@@ -165,11 +164,18 @@ jobs:
165164
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
166165
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
167166
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
167+
- name: Login to Container Registries
168+
uses: ./.github/actions/docker-login
169+
with:
170+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
171+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
172+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
173+
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
168174
- name: Docker Tag and Push
169175
uses: ./.github/actions/docker-tag-push
170176
with:
171177
local_image: ${{ steps.build-image.outputs.image_tag }}
172-
push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
178+
push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
173179
# OPS-1145: Switch aws_push to true
174180
aws_push: 'false'
175181
azure_push: 'true'
@@ -223,11 +229,18 @@ jobs:
223229
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
224230
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
225231

232+
- name: Login to Container Registries
233+
uses: ./.github/actions/docker-login
234+
with:
235+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
236+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
237+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
238+
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
226239
- name: Docker Tag and Push
227240
uses: ./.github/actions/docker-tag-push
228241
with:
229242
local_image: ${{ steps.build-image.outputs.image_tag }}
230-
push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
243+
push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
231244
# OPS-1145: Switch aws_push to true
232245
aws_push: 'false'
233246
azure_push: 'true'
@@ -281,11 +294,18 @@ jobs:
281294
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
282295
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
283296

297+
- name: Login to Container Registries
298+
uses: ./.github/actions/docker-login
299+
with:
300+
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
301+
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
302+
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
303+
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
284304
- name: Docker Tag and Push
285305
uses: ./.github/actions/docker-tag-push
286306
with:
287307
local_image: ${{ steps.build-image.outputs.image_tag }}
288-
push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
308+
push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
289309
# OPS-1145: Switch aws_push to true
290310
aws_push: 'false'
291311
azure_push: 'true'

.github/workflows/container-validation-dynamo.yml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,9 @@ jobs:
3333
uses: docker/setup-buildx-action@v3
3434
- name: Login to NGC
3535
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
36-
run: |
37-
echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
36+
uses: ./.github/actions/docker-login
37+
with:
38+
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
3839
- name: Define Image Tag
3940
id: define_image_tag
4041
run: |

0 commit comments

Comments
 (0)