Skip to content

Commit 89ef073

Browse files
authored
Merge branch 'main' into tzulingk/parse
2 parents 4d52397 + 8f0ac73 commit 89ef073

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+3025
-462
lines changed

.github/actions/docker-build/action.yml

Lines changed: 36 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,12 @@ inputs:
4949
torch_backend:
5050
description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
5151
required: false
52+
enable_kvbm:
53+
description: 'Enable KVBM support (optional)'
54+
required: false
55+
dynamo_base_image:
56+
description: 'Pre-built Dynamo base image to use instead of building from scratch'
57+
required: false
5258

5359
outputs:
5460
image_tag:
@@ -72,14 +78,9 @@ runs:
7278
aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
7379
- name: Login to NGC
7480
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
75-
shell: bash
76-
run: |
77-
echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
78-
- name: Cleanup
79-
if: always()
80-
shell: bash
81-
run: |
82-
docker system prune -af
81+
uses: ./.github/actions/docker-login
82+
with:
83+
ngc_ci_access_token: ${{ inputs.ngc_ci_access_token }}
8384
- name: Build image
8485
id: build
8586
shell: bash
@@ -125,6 +126,12 @@ runs:
125126
if [ -n "${{ inputs.torch_backend }}" ]; then
126127
EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}"
127128
fi
129+
if [ -n "${{ inputs.dynamo_base_image }}" ]; then
130+
EXTRA_ARGS+=" --dynamo-base-image ${{ inputs.dynamo_base_image }}"
131+
fi
132+
if [ -n "${{ inputs.enable_kvbm }}" ]; then
133+
EXTRA_ARGS+=" --build-arg ENABLE_KVBM=${{ inputs.enable_kvbm }}"
134+
fi
128135
129136
# Execute build and capture output (show on console AND save to file)
130137
./container/build.sh --tag "$IMAGE_TAG" \
@@ -144,6 +151,26 @@ runs:
144151
# Exit with the build's exit code
145152
exit ${BUILD_EXIT_CODE}
146153
154+
- name: Run Sanity Check on Runtime Image
155+
if: inputs.target == 'runtime'
156+
shell: bash
157+
run: |
158+
IMAGE_TAG="${{ steps.build.outputs.image_tag }}"
159+
echo "Running sanity check on image: $IMAGE_TAG"
160+
161+
# Run the sanity check script inside the container
162+
# The script is located in /workspace/deploy/sanity_check.py in runtime containers
163+
set +e
164+
docker run --rm "$IMAGE_TAG" python /workspace/deploy/sanity_check.py --runtime-check --no-gpu-check
165+
SANITY_CHECK_EXIT_CODE=$?
166+
set -e
167+
if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
168+
echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
169+
exit ${SANITY_CHECK_EXIT_CODE}
170+
else
171+
echo "✅ Sanity check passed"
172+
fi
173+
147174
- name: Capture Build Metrics
148175
id: metrics
149176
shell: bash
@@ -289,7 +316,7 @@ runs:
289316
uses: actions/upload-artifact@v4
290317
if: always()
291318
with:
292-
name: build-metrics-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
319+
name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
293320
path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
294321
retention-days: 7
295322

.github/actions/docker-login/action.yml

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,46 @@
1+
name: 'Docker Login'
2+
description: 'Login to multiple container registries (ECR, NGC, ACR)'
3+
4+
inputs:
5+
ngc_ci_access_token:
6+
description: 'NGC CI Access Token'
7+
required: false
8+
aws_default_region:
9+
description: 'AWS Default Region'
10+
required: false
11+
aws_account_id:
12+
description: 'AWS Account ID'
13+
required: false
14+
azure_acr_hostname:
15+
description: 'Azure ACR hostname'
16+
required: false
17+
azure_acr_user:
18+
description: 'Azure ACR user'
19+
required: false
20+
azure_acr_password:
21+
description: 'Azure ACR password'
22+
required: false
23+
24+
runs:
25+
using: "composite"
26+
steps:
27+
- name: ECR Login
28+
shell: bash
29+
if: ${{ inputs.aws_default_region != '' && inputs.aws_account_id != '' }}
30+
env:
31+
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
32+
run: |
33+
set -euo pipefail
34+
aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
35+
- name: NGC Login
36+
if: ${{ inputs.ngc_ci_access_token != '' }}
37+
shell: bash
38+
run: |
39+
set -euo pipefail
40+
echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
41+
- name: ACR Login
42+
shell: bash
43+
if: ${{ inputs.azure_acr_hostname != '' && inputs.azure_acr_user != '' && inputs.azure_acr_password != '' }}
44+
run: |
45+
set -euo pipefail
46+
echo "${{ inputs.azure_acr_password }}" | docker login "${{ inputs.azure_acr_hostname }}" --username "${{ inputs.azure_acr_user }}" --password-stdin

.github/actions/docker-tag-push/action.yml

Lines changed: 28 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
1+
name: 'Docker Tag and Push'
12
description: 'Tag and Push Docker Images'
23

34
inputs:
45
local_image:
56
description: 'Local Image Name:Tag'
67
required: true
7-
push_tag:
8-
description: 'Target Name:Tag'
8+
push_tags:
9+
description: 'Target Name:Tag (newline-separated list for multiple tags)'
910
required: true
1011
aws_push:
1112
description: 'Push to AWS Boolean'
@@ -38,37 +39,48 @@ inputs:
3839
required: false
3940

4041
outputs:
41-
image_tag:
42-
description: 'Image Tag'
43-
value: ${{ inputs.push_tag }}
42+
image_tags:
43+
description: 'Image Tags'
44+
value: ${{ inputs.push_tags }}
4445

4546
runs:
4647
using: "composite"
4748
steps:
4849
- name: Set up Docker Buildx
4950
uses: docker/setup-buildx-action@v3
50-
- name: ACR Login
51-
shell: bash
52-
if: ${{ inputs.azure_push == 'true' }}
53-
run: |
54-
echo "${{ inputs.azure_acr_password }}" | docker login ${{ inputs.azure_acr_hostname }} --username ${{ inputs.azure_acr_user }} --password-stdin
51+
5552
- name: ECR Tag and Push
5653
shell: bash
5754
if: ${{ inputs.aws_push == 'true' }}
5855
env:
5956
LOCAL_IMAGE: ${{ inputs.local_image }}
60-
PUSH_TAG: ${{ inputs.push_tag }}
57+
PUSH_TAGS: ${{ inputs.push_tags }}
6158
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
6259
run: |
63-
docker tag ${LOCAL_IMAGE} ${ECR_HOSTNAME}/${PUSH_TAG}
64-
docker push ${ECR_HOSTNAME}/${PUSH_TAG}
60+
set -euo pipefail
61+
while IFS= read -r TAG; do
62+
if [ -z "$TAG" ]; then
63+
continue
64+
fi
65+
echo "Tagging and pushing: ${ECR_HOSTNAME}/${TAG}"
66+
docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${TAG}"
67+
docker push "${ECR_HOSTNAME}/${TAG}"
68+
done <<< "$PUSH_TAGS"
69+
6570
- name: ACR Tag and Push
6671
shell: bash
6772
if: ${{ inputs.azure_push == 'true' }}
6873
env:
6974
LOCAL_IMAGE: ${{ inputs.local_image }}
70-
PUSH_TAG: ${{ inputs.push_tag }}
75+
PUSH_TAGS: ${{ inputs.push_tags }}
7176
AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
7277
run: |
73-
docker tag ${LOCAL_IMAGE} ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
74-
docker push ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
78+
set -euo pipefail
79+
while IFS= read -r TAG; do
80+
if [ -z "$TAG" ]; then
81+
continue
82+
fi
83+
echo "Tagging and pushing: ${AZURE_ACR_HOSTNAME}/${TAG}"
84+
docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${TAG}"
85+
docker push "${AZURE_ACR_HOSTNAME}/${TAG}"
86+
done <<< "$PUSH_TAGS"

.github/actions/pytest/action.yml

Lines changed: 38 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,10 @@ inputs:
2424
description: 'Platform architecture (amd64, arm64)'
2525
required: false
2626
default: 'amd64'
27+
dry_run:
28+
description: 'Run pytest in dry-run mode (collect tests only, do not execute)'
29+
required: false
30+
default: 'false'
2731

2832

2933
runs:
@@ -54,31 +58,50 @@ runs:
5458
# Run pytest with detailed output and JUnit XML
5559
set +e # Don't exit on test failures
5660
57-
# Detect GPU availability and conditionally add GPU flags
58-
GPU_FLAGS=""
59-
if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
60-
echo "GPU detected, enabling GPU runtime"
61-
GPU_FLAGS="--runtime=nvidia --gpus all"
61+
# Determine docker runtime flags and pytest command based on dry_run mode
62+
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
63+
echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
64+
GPU_FLAGS=""
65+
PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
6266
else
63-
echo "No GPU detected, running in CPU-only mode"
67+
echo "🚀 Running pytest in normal mode"
68+
PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
69+
70+
# Detect GPU availability and conditionally add GPU flags
71+
GPU_FLAGS=""
72+
if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
73+
echo "✓ GPU detected, enabling GPU runtime"
74+
GPU_FLAGS="--runtime=nvidia --gpus all"
75+
else
76+
echo "⚠️ No GPU detected, running in CPU-only mode"
77+
fi
6478
fi
6579
80+
# Get absolute path for test-results directory and ensure it has proper permissions
81+
TEST_RESULTS_DIR="$(pwd)/test-results"
82+
chmod 777 "${TEST_RESULTS_DIR}"
83+
echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
84+
6685
docker run ${GPU_FLAGS} --rm -w /workspace \
6786
--cpus=${NUM_CPUS} \
6887
--network host \
6988
--name ${{ env.CONTAINER_ID }}_pytest \
89+
-v "${TEST_RESULTS_DIR}:/workspace/test-results" \
7090
${{ inputs.image_tag }} \
71-
bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
91+
bash -c "${PYTEST_CMD}"
7292
7393
TEST_EXIT_CODE=$?
7494
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
7595
echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"
7696
77-
# Copy test results from container to host
78-
docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/test-results . || echo "Failed to copy test results"
79-
80-
# Clean up container
81-
docker rm -f ${{ env.CONTAINER_ID }}_pytest || echo "Failed to clean up container"
97+
# Verify test results were written (only in normal mode)
98+
if [[ "${{ inputs.dry_run }}" != "true" ]]; then
99+
if [[ -f "${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}" ]]; then
100+
echo "✅ Test results file found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
101+
else
102+
echo "⚠️ Test results file not found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
103+
fi
104+
fi
82105
83106
# Always continue to results processing
84107
exit 0
@@ -103,23 +126,9 @@ runs:
103126
ERROR_TESTS=$(grep -o 'errors="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
104127
echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)"
105128
106-
# Create uniquely named metadata file with step context information
107-
# Use framework-testtype-arch to make it unique per test run
108-
METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.json"
109-
JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.xml"
110-
111129
# Rename XML file to unique name
130+
JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml"
112131
mv "$JUNIT_FILE" "test-results/$JUNIT_NAME"
113-
114-
echo '{' > "$METADATA_FILE"
115-
echo ' "job_name": "${{ github.job }}",' >> "$METADATA_FILE"
116-
echo ' "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE"
117-
echo ' "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE"
118-
echo ' "platform_arch": "${{ inputs.platform_arch }}",' >> "$METADATA_FILE"
119-
echo ' "junit_xml_file": "'"$JUNIT_NAME"'",' >> "$METADATA_FILE"
120-
echo ' "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE"
121-
echo '}' >> "$METADATA_FILE"
122-
echo "📝 Created test metadata file: $METADATA_FILE"
123132
echo "📝 Renamed XML file to: $JUNIT_NAME"
124133
else
125134
echo "⚠️ JUnit XML file not found - test results may not be available for upload"
@@ -135,8 +144,6 @@ runs:
135144
uses: actions/upload-artifact@v4
136145
if: always() # Always upload test results, even if tests failed
137146
with:
138-
name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
139-
path: |
140-
test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
141-
test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
147+
name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
148+
path: test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
142149
retention-days: 7

0 commit comments

Comments
 (0)