Skip to content

tests: add test cases for lora agg and agg router #7920

tests: add test cases for lora agg and agg router

tests: add test cases for lora agg and agg router #7920

# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Docker Build and Test
on:
push:
branches:
- main
- "pull-request/[0-9]+"
- release/*.*.*
workflow_dispatch:
inputs:
run_deploy_operator:
description: 'Run deploy operator and deployment tests'
required: false
type: boolean
default: false
concurrency:
# The group name is a ternary operation. If the ref_name is 'main',
# then the group name uses the run_id to ensure a unique group for
# 'main' pushes. Otherwise, the group name is the ref_name, so that
# workflows on the same PR/branch have the same group name for cancelling.
group: docker-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
changed-files:
runs-on: ubuntu-latest
environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
outputs:
has_code_changes: ${{ steps.filter.outputs.has_code_changes }}
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Check for changes
uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
id: filter
with:
filters: .github/filters.yaml
backend-status-check:
runs-on: ubuntu-latest
needs: [vllm, sglang, trtllm, operator]
if: always()
steps:
- name: "Check all dependent jobs"
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
operator:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: cpu-amd-m5-2xlarge }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: operator (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
driver: docker
- name: Login to ECR
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Linter
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
cd deploy/cloud/operator
docker build --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
- name: Tester
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
cd deploy/cloud/operator
docker build --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
- name: Set up Go
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
with:
go-version: '1.24'
- name: Check for uncommitted changes
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
sudo apt-get update && sudo apt-get install -y make
cd deploy/cloud/operator
make check
- name: Build Container
id: build-image
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
cd deploy/cloud/operator
docker buildx build --load \
--platform linux/${{ matrix.platform.arch }} \
--build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
-f Dockerfile \
-t dynamo-operator:latest .
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: dynamo-operator:latest
push_tags: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
vllm:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: vllm (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Build Container
id: build-image
uses: ./.github/actions/docker-build
with:
framework: vllm
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "pre_merge and vllm"
framework: "vllm"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
sglang:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: sglang (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Build Container
id: build-image
uses: ./.github/actions/docker-build
with:
framework: sglang
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "pre_merge and sglang"
framework: "sglang"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
trtllm:
needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: trtllm (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Build Container
id: build-image
uses: ./.github/actions/docker-build
with:
framework: trtllm
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "pre_merge and trtllm"
framework: "trtllm"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
deploy-test-fault-tolerance:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, operator, vllm, trtllm, sglang]
permissions:
contents: read
strategy:
fail-fast: false
# Run matrix jobs sequentially to prevent a Helm race condition
# Parallel jobs conflict on ClusterRole ownership when installing the chart.
# Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
max-parallel: 1
matrix:
framework:
- { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
- { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
- { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
env:
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Set namespace
run: |
# Set namespace using test scenario
export FRAMEWORK=${{ matrix.framework.name }}
echo "NAMESPACE=gh-id-${{ github.run_id }}-ft-${FRAMEWORK}" >> $GITHUB_ENV
set -x
# Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config current-context
- name: Deploy Operator
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
# Create a namespace for this job
echo "Creating an ephemeral namespace..."
kubectl delete namespace $NAMESPACE || true
kubectl create namespace $NAMESPACE || true
echo "Attaching the labels for secrets and cleanup"
kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
# Set the namespace as default
kubectl config set-context --current --namespace=$NAMESPACE
# Check if Istio is installed
kubectl get pods -n istio-system
# Check if default storage class exists
kubectl get storageclass
# Install Helm chart
export VIRTUAL_ENV=/opt/dynamo/venv
export KUBE_NS=$NAMESPACE
export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
# Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
# Create docker pull secret for operator image
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/
helm dep build .
# Install platform with namespace restriction for single profile testing
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret \
--timeout 10m --wait
# Wait for all deployments to be ready
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
cd -
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE
- name: Run Fault Tolerance Tests
id: run-ft-tests
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
export NAMESPACE=$NAMESPACE
export FRAMEWORK=${{ matrix.framework.name }}
export IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
echo "Using namespace: $NAMESPACE"
echo "Using image: $IMAGE"
# Install python3-venv package if not already installed
sudo apt-get update && sudo apt-get install -y python3-venv
# Set up Python virtual environment and install test dependencies
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -r container/deps/requirements.test.txt
pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
# Create test-results directory
mkdir -p test-results
# Run the pytest command with JUnit XML output
set +e # Don't exit on test failures
pytest tests/fault_tolerance/deploy/test_deployment.py \
-m 'k8s and fault_tolerance' \
-k '${{ matrix.framework.test_scenario }}' \
-s -v \
--namespace ${NAMESPACE} \
--image ${IMAGE} \
--client-type legacy \
--junitxml=test-results/pytest_ft_report.xml \
--tb=short
TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}
continue-on-error: true
- name: Process Fault Tolerance Test Results
if: always()
run: |
set -x
# Rename JUnit XML with unique naming if it exists
if [ -f "test-results/pytest_ft_report.xml" ]; then
mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
echo "✅ JUnit XML report renamed with unique identifier"
else
echo "⚠️ JUnit XML report not found"
fi
- name: Upload Fault Tolerance Test Results
uses: actions/upload-artifact@v4
if: always()
with:
name: test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7
- name: Cleanup
if: always()
timeout-minutes: 5
run: |
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
# For debugging purposes, list all the resources before we uninstall
kubectl get all
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
kubectl delete dynamographdeployments --all -n $NAMESPACE || true
# Uninstall the helm chart
helm ls
helm uninstall dynamo-platform || true
echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
kubectl delete namespace $NAMESPACE || true
echo "Namespace $NAMESPACE completed."
deploy-operator:
runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-operator job in CI.
#if: needs.changed-files.outputs.has_code_changes == 'true'
if: github.event.inputs.run_deploy_operator
needs: [changed-files, operator, vllm, sglang, trtllm]
env:
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
outputs:
NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- uses: actions/checkout@v4
- name: Deploy Operator
id: deploy-operator-step
env:
BRANCH: ${{ github.ref_name }}
run: |
set -x
# Set namespace
# Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
BRANCH_SANITIZED="${BRANCH//\//-}"
BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
# Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config current-context
# Create a namespace for this job
echo "Creating an ephemeral namespace..."
kubectl create namespace $NAMESPACE
echo "Attaching the labels for secrets and cleanup"
kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
# Set the namespace as default
kubectl config set-context --current --namespace=$NAMESPACE
# Check if Istio is installed
kubectl get pods -n istio-system
# Check if default storage class exists
kubectl get storageclass
# Install Helm chart
export VIRTUAL_ENV=/opt/dynamo/venv
export KUBE_NS=$NAMESPACE
export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
# Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
# Create docker pull secret for operator image
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/
helm dep build .
# Install platform with namespace restriction for single profile testing
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
# Wait for all deployments to be ready
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
deploy-test-vllm:
runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-test-vllm job in CI.
#if: needs.changed-files.outputs.has_code_changes == 'true'
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, vllm]
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
name: deploy-test-vllm (${{ matrix.profile }})
env:
FRAMEWORK: vllm
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
MODEL_NAME: "Qwen/Qwen3-0.6B"
steps: &deploy-test-steps
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- uses: actions/checkout@v4
- name: Setup Kubeconfig
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
# Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config get-contexts
- name: Run Tests
id: run-tests
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE
# Redirect all output to a log file while still showing it
exec > >(tee -a test-output.log) 2>&1
cd examples/backends/$FRAMEWORK
export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
export KUBE_NS=$NAMESPACE
export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
echo "GRAPH_NAME=${GRAPH_NAME}" >> $GITHUB_ENV
# Update the deployment file in-place
yq -i '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE
# Debug: Show updated deployment file
echo "=== UPDATED DEPLOYMENT FILE ==="
cat $DEPLOYMENT_FILE
# Apply the updated file
kubectl apply -n $KUBE_NS -f $DEPLOYMENT_FILE
# --- Wait for all pods in the dynamo graph deployment to be ready ---
sleep 20
# Get the deployment name from the file
export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
echo "Waiting for all pods with label nvidia.com/dynamo-graph-deployment-name: $GRAPH_NAME"
# Wait for all pods with the deployment label to be ready
kubectl wait --for=condition=ready pod -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n ${KUBE_NS} --timeout=1800s
# Debug: Show final pod statuses for the deployment
echo "=== FINAL POD STATUSES ==="
kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $KUBE_NS -o wide
echo ""
kubectl get all -n $KUBE_NS
export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} -l nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME} | tail -n1 | awk '{print $1}')
export CONTAINER_PORT=$(kubectl get pod $FRONTEND_POD -n ${KUBE_NS} -o jsonpath='{.spec.containers[0].ports[?(@.name=="http")].containerPort}')
echo "Container port is ${CONTAINER_PORT}"
kubectl port-forward pod/$FRONTEND_POD 8000:${CONTAINER_PORT} -n ${KUBE_NS} &
export LLM_URL="http://localhost:8000"
sleep 10 # Give port-forward time to establish the connection
echo "LLM URL: ${LLM_URL}"
echo "MODEL NAME: ${MODEL_NAME}"
# Wait until the model is available in the /v1/models response
MAX_ATTEMPTS=30
ATTEMPT=1
while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
MODELS_RESPONSE=$(curl -s --retry 5 --retry-delay 2 --retry-connrefused "${LLM_URL}/v1/models")
if echo "$MODELS_RESPONSE" | jq -e --arg MODEL_NAME "$MODEL_NAME" '.data[]?.id == $MODEL_NAME' >/dev/null 2>&1; then
echo "Model $MODEL_NAME is available in /v1/models"
break
fi
echo "Waiting for model $MODEL_NAME to be available in /v1/models... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
sleep 5
ATTEMPT=$((ATTEMPT + 1))
done
if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
echo "Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts"
echo "Last response: $MODELS_RESPONSE"
exit 1
fi
RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused -X POST "${LLM_URL}/v1/chat/completions" \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
"model": "'"${MODEL_NAME:-Qwen/Qwen3-0.6B}"'",
"messages": [
{
"role": "user",
"content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
}
],
"stream":false,
"max_tokens": 30,
"temperature": 0.0
}' 2>&1)
echo "Response: $RESPONSE"
TEST_RESULT=0
if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
echo "Test failed: Response is not valid JSON"
echo "Got: $RESPONSE"
TEST_RESULT=1
elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
echo "Test failed: Message role is not 'assistant'"
echo "Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
TEST_RESULT=1
elif ! echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null 2>&1; then
echo "Test failed: Model name is incorrect"
echo "Got: $(echo "$RESPONSE" | jq '.model')"
TEST_RESULT=1
elif ! echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null 2>&1; then
echo "Test failed: Response content length is not greater than 100 characters"
echo "Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
TEST_RESULT=1
else
echo "Test passed: Response matches expected format and content"
fi
exit $TEST_RESULT
continue-on-error: true
- name: Process Deployment Test Results
if: always()
run: |
set -x
# Create test-results directory
mkdir -p test-results
# Copy and rename the test output log with unique naming
if [ -f "test-output.log" ]; then
cp test-output.log "test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log"
echo "✅ Test output log copied to test-results/"
else
echo "⚠️ test-output.log not found"
fi
- name: Upload Deployment Test Results
uses: actions/upload-artifact@v4
if: always()
with:
name: test-results-${{ env.FRAMEWORK }}-deploy-${{ matrix.profile }}-amd64-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log
retention-days: 7
- name: Cleanup
if: always()
timeout-minutes: 5
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE
# For debugging purposes, list all the resources before we delete
kubectl get all
echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..."
kubectl delete dynamographdeployments ${GRAPH_NAME} -n $NAMESPACE || true
deploy-test-sglang:
runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-test-sglang job in CI.
#if: needs.changed-files.outputs.has_code_changes == 'true'
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, sglang]
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
matrix:
profile:
- agg
- agg_router
name: deploy-test-sglang (${{ matrix.profile }})
env:
FRAMEWORK: sglang
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
MODEL_NAME: "Qwen/Qwen3-0.6B"
steps: *deploy-test-steps
deploy-test-trtllm:
runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-test-trtllm job in CI.
#if: needs.changed-files.outputs.has_code_changes == 'true'
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, trtllm]
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
name: deploy-test-trtllm (${{ matrix.profile }})
env:
FRAMEWORK: trtllm
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
MODEL_NAME: "Qwen/Qwen3-0.6B"
steps: *deploy-test-steps
cleanup:
runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment the below if statement when we have a way to test the cleanup job in CI.
# if: always()
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
steps:
- name: Output Node Name
shell: bash
run: |
echo ${K8S_NODE_NAME}
- uses: actions/checkout@v4
- name: Setup Kubeconfig
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
# Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config current-context
- name: Cleanup
timeout-minutes: 5
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
# For debugging purposes, list all the resources before we uninstall
kubectl get all
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
kubectl delete dynamographdeployments --all -n $NAMESPACE || true
# Uninstall the helm chart
helm ls
helm uninstall dynamo-platform --namespace $NAMESPACE || true
echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
kubectl delete namespace $NAMESPACE || true
echo "Namespace $NAMESPACE completed."