Merge branch 'main' into fix/sglang-multimodal-worker-registration #7327

Workflow file for this run

.github/workflows/container-validation-backends.yml at 7c21500

	# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0

	# Display name of the workflow.
	name: Docker Build and Test

	# Runs on pushes to the listed branch patterns and on manual dispatch.
	on:
	  push:
	    branches:
	      - main
	      - "pull-request/[0-9]+"
	      # NOTE(review): this pattern reads like extraction residue of a glob
	      # such as `release/*.*.*`; kept verbatim - confirm against the repository.
	      - release/..*
	  workflow_dispatch:
	    inputs:
	      # Opt-in switch read by the deploy-operator, deploy-test-* and cleanup jobs.
	      run_deploy_operator:
	        description: 'Run deploy operator and deployment tests'
	        required: false
	        type: boolean
	        default: false

	concurrency:
	  # The group name is a ternary operation. If the ref_name is 'main',
	  # then the group name uses the run_id to ensure a unique group for
	  # 'main' pushes. Otherwise, the group name is the ref_name, so that
	  # workflows on the same PR/branch have the same group name for cancelling.
	  group: docker-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
	  # Runs on main are never cancelled; every other ref cancels its in-flight run.
	  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

	jobs:
	# Evaluates .github/filters.yaml against the change set and publishes the
	# `has_code_changes` output that the build jobs gate on.
	changed-files:
	  runs-on: ubuntu-latest
	  # Manual dispatches are bound to the 'protected-deploy' environment;
	  # every other event uses no environment (empty string).
	  environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
	  outputs:
	    has_code_changes: ${{ steps.filter.outputs.has_code_changes }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
	    - name: Check for changes
	      uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2
	      id: filter
	      with:
	        filters: .github/filters.yaml

	# Aggregate gate: passes only when every job listed in `needs` finished
	# with result "success" or "skipped".
	backend-status-check:
	  runs-on: ubuntu-latest
	  needs: [vllm, sglang, trtllm, operator]
	  # always() so the gate still reports when a dependency failed or was skipped.
	  if: always()
	  steps:
	    - name: "Check all dependent jobs"
	      env:
	        # Passed through the environment instead of being pasted into the
	        # script, so quotes inside job outputs cannot break the shell command.
	        NEEDS_JSON: ${{ toJson(needs) }}
	      run: |
	        printf '%s\n' "$NEEDS_JSON" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'

	# Lints, tests, builds and pushes the operator image, once per architecture
	# in the matrix. Skipped when changed-files reports no code changes.
	operator:
	  needs: changed-files
	  if: needs.changed-files.outputs.has_code_changes == 'true'
	  strategy:
	    fail-fast: false
	    matrix:
	      platform:
	        - { arch: amd64, runner: cpu-amd-m5-2xlarge }
	        - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
	  name: operator (${{ matrix.platform.arch }})
	  runs-on: ${{ matrix.platform.runner }}
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
	    - name: Set up Docker Buildx
	      # NOTE(review): referenced by tag while the checkout and setup-go
	      # actions in this job are pinned to a commit SHA - consider pinning.
	      uses: docker/setup-buildx-action@v3
	      with:
	        driver: docker
	    - name: Login to ECR
	      shell: bash
	      env:
	        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	      run: |
	        aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
	    # The linter and tester run as build targets of the operator Dockerfile;
	    # base images are pulled through the ECR path given in DOCKER_PROXY.
	    - name: Linter
	      shell: bash
	      env:
	        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	      run: |
	        cd deploy/cloud/operator
	        docker build --target linter --progress=plain --build-arg DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/" .
	    - name: Tester
	      shell: bash
	      env:
	        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	      run: |
	        cd deploy/cloud/operator
	        docker build --target tester --progress=plain --build-arg DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/" .

	    - name: Set up Go
	      uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
	      with:
	        go-version: '1.24'
	    - name: Check for uncommitted changes
	      shell: bash
	      env:
	        # NOTE(review): not referenced by this script; presumably read by
	        # `make check` - confirm before removing.
	        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	      run: |
	        sudo apt-get update && sudo apt-get install -y make
	        cd deploy/cloud/operator
	        make check
	    - name: Build Container
	      id: build-image
	      shell: bash
	      env:
	        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
	      run: |
	        cd deploy/cloud/operator
	        docker buildx build --load \
	          --platform linux/${{ matrix.platform.arch }} \
	          --build-arg DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/" \
	          -f Dockerfile \
	          -t dynamo-operator:latest .
	    # Tag is <sha>-operator-<arch>; only the Azure push is enabled here.
	    - name: Docker Tag and Push
	      uses: ./.github/actions/docker-tag-push
	      with:
	        local_image: dynamo-operator:latest
	        push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
	        aws_push: 'false'
	        azure_push: 'true'
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	# Builds and pushes the vLLM runtime image per architecture; unit and e2e
	# tests run only on the non-arm64 matrix entry.
	vllm:
	  needs: changed-files
	  if: needs.changed-files.outputs.has_code_changes == 'true'
	  strategy:
	    fail-fast: false
	    matrix:
	      platform:
	        - { arch: amd64, runner: gpu-l40-amd64 }
	        - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
	  name: vllm (${{ matrix.platform.arch }})
	  runs-on: ${{ matrix.platform.runner }}
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
	    - name: Build Container
	      id: build-image
	      uses: ./.github/actions/docker-build
	      with:
	        framework: vllm
	        target: runtime
	        platform: 'linux/${{ matrix.platform.arch }}'
	        # The next four inputs are overridden for arm64 only; on other
	        # architectures an empty string is passed.
	        base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
	        runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
	        cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
	        torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
	        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
	        ci_token: ${{ secrets.CI_TOKEN }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	    - name: Docker Tag and Push
	      uses: ./.github/actions/docker-tag-push
	      with:
	        local_image: ${{ steps.build-image.outputs.image_tag }}
	        push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
	        # OPS-1145: Switch aws_push to true
	        aws_push: 'false'
	        azure_push: 'true'
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	    - name: Run unit tests
	      if: ${{ matrix.platform.arch != 'arm64' }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.build-image.outputs.image_tag }}
	        pytest_marks: "unit and vllm and gpu_1"
	        framework: "vllm"
	        test_type: "unit"
	        platform_arch: ${{ matrix.platform.arch }}
	    - name: Run e2e tests
	      if: ${{ matrix.platform.arch != 'arm64' }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.build-image.outputs.image_tag }}
	        pytest_marks: "e2e and vllm and gpu_1 and not slow"
	        framework: "vllm"
	        test_type: "e2e, gpu_1"
	        platform_arch: ${{ matrix.platform.arch }}

	# Builds and pushes the SGLang runtime image per architecture; unit and e2e
	# tests run only on the non-arm64 matrix entry.
	sglang:
	  needs: changed-files
	  if: needs.changed-files.outputs.has_code_changes == 'true'
	  strategy:
	    fail-fast: false
	    matrix:
	      platform:
	        - { arch: amd64, runner: gpu-l40-amd64 }
	        - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
	  name: sglang (${{ matrix.platform.arch }})
	  runs-on: ${{ matrix.platform.runner }}
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - name: Checkout repository
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

	    - name: Build Container
	      id: build-image
	      uses: ./.github/actions/docker-build
	      with:
	        framework: sglang
	        target: runtime
	        platform: 'linux/${{ matrix.platform.arch }}'
	        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
	        ci_token: ${{ secrets.CI_TOKEN }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

	    - name: Docker Tag and Push
	      uses: ./.github/actions/docker-tag-push
	      with:
	        local_image: ${{ steps.build-image.outputs.image_tag }}
	        push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
	        # OPS-1145: Switch aws_push to true
	        aws_push: 'false'
	        azure_push: 'true'
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	    - name: Run unit tests
	      if: ${{ matrix.platform.arch != 'arm64' }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.build-image.outputs.image_tag }}
	        pytest_marks: "unit and sglang and gpu_1"
	        framework: "sglang"
	        test_type: "unit"
	        platform_arch: ${{ matrix.platform.arch }}
	    - name: Run e2e tests
	      if: ${{ matrix.platform.arch != 'arm64' }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.build-image.outputs.image_tag }}
	        # NOTE(review): unlike the vllm and trtllm jobs this mark expression
	        # has no "and not slow" - confirm that is intentional.
	        pytest_marks: "e2e and sglang and gpu_1"
	        framework: "sglang"
	        test_type: "e2e, gpu_1"
	        platform_arch: ${{ matrix.platform.arch }}

	# Builds and pushes the TensorRT-LLM runtime image per architecture; unit and
	# e2e tests run only on the non-arm64 matrix entry.
	trtllm:
	  needs: changed-files
	  if: needs.changed-files.outputs.has_code_changes == 'true'
	  strategy:
	    fail-fast: false
	    matrix:
	      platform:
	        - { arch: amd64, runner: gpu-l40-amd64 }
	        - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
	  name: trtllm (${{ matrix.platform.arch }})
	  runs-on: ${{ matrix.platform.runner }}
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

	    - name: Build Container
	      id: build-image
	      uses: ./.github/actions/docker-build
	      with:
	        framework: trtllm
	        target: runtime
	        platform: 'linux/${{ matrix.platform.arch }}'
	        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
	        ci_token: ${{ secrets.CI_TOKEN }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

	    - name: Docker Tag and Push
	      uses: ./.github/actions/docker-tag-push
	      with:
	        local_image: ${{ steps.build-image.outputs.image_tag }}
	        push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
	        # OPS-1145: Switch aws_push to true
	        aws_push: 'false'
	        azure_push: 'true'
	        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
	        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
	        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
	        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
	        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

	    - name: Run unit tests
	      if: ${{ matrix.platform.arch != 'arm64' }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.build-image.outputs.image_tag }}
	        pytest_marks: "unit and trtllm and gpu_1"
	        framework: "trtllm"
	        test_type: "unit"
	        platform_arch: ${{ matrix.platform.arch }}
	    - name: Run e2e tests
	      if: ${{ matrix.platform.arch != 'arm64' }}
	      uses: ./.github/actions/pytest
	      with:
	        image_tag: ${{ steps.build-image.outputs.image_tag }}
	        pytest_marks: "e2e and trtllm and gpu_1 and not slow"
	        framework: "trtllm"
	        test_type: "e2e, gpu_1"
	        platform_arch: ${{ matrix.platform.arch }}

	# Per framework: creates an ephemeral namespace, installs the platform Helm
	# chart with the operator image built for this commit, runs the
	# fault-tolerance pytest scenario, then tears the namespace down.
	deploy-test-fault-tolerance:
	  runs-on: cpu-amd-m5-2xlarge
	  if: needs.changed-files.outputs.has_code_changes == 'true'
	  needs: [changed-files, operator, vllm, trtllm, sglang]
	  permissions:
	    contents: read
	  strategy:
	    fail-fast: false
	    # Run matrix jobs sequentially to prevent a Helm race condition
	    # Parallel jobs conflict on ClusterRole ownership when installing the chart.
	    # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
	    max-parallel: 1
	    matrix:
	      framework:
	        - { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
	        - { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
	        - { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
	  name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
	  env:
	    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - name: Checkout code
	      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
	    - name: Set namespace
	      run: |
	        # Set namespace using test scenario
	        export FRAMEWORK=${{ matrix.framework.name }}
	        # Values appended to $GITHUB_ENV only become visible in later steps,
	        # so NAMESPACE is also assigned here for the kubectl call below.
	        NAMESPACE="gh-id-${{ github.run_id }}-ft-${FRAMEWORK}"
	        echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"
	        set -x

	        # Setup kubeconfig
	        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
	        chmod 600 .kubeconfig
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
	        kubectl config current-context
	    - name: Deploy Operator
	      run: |
	        set -x
	        export KUBECONFIG=$(pwd)/.kubeconfig

	        # Create a namespace for this job
	        echo "Creating an ephemeral namespace..."
	        kubectl delete namespace $NAMESPACE || true
	        kubectl create namespace $NAMESPACE || true
	        echo "Attaching the labels for secrets and cleanup"
	        kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

	        # Set the namespace as default
	        kubectl config set-context --current --namespace=$NAMESPACE

	        # Check if Istio is installed
	        kubectl get pods -n istio-system
	        # Check if default storage class exists
	        kubectl get storageclass

	        # Install Helm chart
	        export VIRTUAL_ENV=/opt/dynamo/venv
	        export KUBE_NS=$NAMESPACE
	        export ISTIO_ENABLED=true
	        export ISTIO_GATEWAY=istio-system/ingress-alb
	        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
	        export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}

	        # Install dynamo env secrets
	        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
	        # Create docker pull secret for operator image
	        kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
	        # Install helm dependencies
	        helm repo add bitnami https://charts.bitnami.com/bitnami
	        cd deploy/cloud/helm/platform/
	        helm dep build .
	        # Install platform with namespace restriction for single profile testing
	        helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
	          --set dynamo-operator.namespaceRestriction.enabled=true \
	          --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
	          --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
	          --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
	          --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret \
	          --timeout 10m --wait
	        # Wait for all deployments to be ready
	        timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
	        cd -

	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE
	    - name: Run Fault Tolerance Tests
	      run: |
	        set -x
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        export NAMESPACE=$NAMESPACE
	        export FRAMEWORK=${{ matrix.framework.name }}
	        export IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"

	        echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
	        echo "Using namespace: $NAMESPACE"
	        echo "Using image: $IMAGE"

	        # Install python3-venv package if not already installed
	        sudo apt-get update && sudo apt-get install -y python3-venv

	        # Set up Python virtual environment and install test dependencies
	        python3 -m venv venv
	        source venv/bin/activate
	        pip install --upgrade pip
	        pip install -r container/deps/requirements.test.txt
	        pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic

	        # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
	        pytest tests/fault_tolerance/deploy/test_deployment.py \
	          -m 'k8s and fault_tolerance' \
	          -k '${{ matrix.framework.test_scenario }}' \
	          -s -v \
	          --namespace ${NAMESPACE} \
	          --image ${IMAGE} \
	          --client-type legacy
	    - name: Cleanup
	      if: always()
	      timeout-minutes: 5
	      run: |
	        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
	        chmod 600 .kubeconfig
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"

	        # For debugging purposes, list all the resources before we uninstall.
	        # Listing failures must not abort the step before the deletions run.
	        kubectl get all || true

	        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
	        kubectl delete dynamographdeployments --all -n $NAMESPACE || true

	        # Uninstall the helm chart (namespace given explicitly rather than
	        # relying on the kubeconfig context default)
	        helm ls || true
	        helm uninstall dynamo-platform --namespace $NAMESPACE || true

	        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
	        kubectl delete namespace $NAMESPACE || true
	        echo "Namespace $NAMESPACE completed."

	# Upload metrics for this workflow and all its jobs
	upload-workflow-metrics:
	  name: Upload Workflow Metrics
	  runs-on: gitlab
	  if: always() # Always run, even if other jobs fail
	  needs: [backend-status-check] # Wait for the status check which waits for all build jobs

	  steps:
	    # NOTE(review): the actions in this job are referenced by major-version
	    # tag, whereas the build jobs pin checkout to a commit SHA.
	    - name: Check out repository
	      uses: actions/checkout@v4

	    - name: Set up Python
	      uses: actions/setup-python@v4
	      with:
	        python-version: '3.x'

	    - name: Install dependencies
	      run: |
	        python -m pip install --upgrade pip
	        pip install requests

	    # Artifacts matching each pattern are merged into a single directory.
	    - name: Download build metrics
	      uses: actions/download-artifact@v4
	      with:
	        pattern: build-metrics-*
	        path: build-metrics/
	        merge-multiple: true
	      continue-on-error: true # Don't fail if artifacts don't exist

	    - name: Download test results
	      uses: actions/download-artifact@v4
	      with:
	        pattern: test-results-*
	        path: test-results/
	        merge-multiple: true
	      continue-on-error: true # Don't fail if artifacts don't exist

	    - name: Upload Complete Workflow Metrics
	      env:
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        WORKFLOW_INDEX: ${{ secrets.WORKFLOW_INDEX }}
	        JOB_INDEX: ${{ secrets.JOB_INDEX }}
	        STEPS_INDEX: ${{ secrets.STEPS_INDEX }}
	        # Container and test index configuration
	        CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
	        TEST_INDEX: ${{ secrets.TEST_INDEX }}
	      run: |
	        # Upload complete workflow metrics including container metrics
	        python3 .github/workflows/upload_complete_workflow_metrics.py

	# Creates an ephemeral namespace named after the run id and branch, installs
	# the platform Helm chart with this commit's operator image, and exposes the
	# namespace name as the NAMESPACE job output.
	deploy-operator:
	  runs-on: cpu-amd-m5-2xlarge
	  # TODO: Uncomment this when we have a way to test the deploy-operator job in CI.
	  #if: needs.changed-files.outputs.has_code_changes == 'true'
	  # github.event.inputs values are strings, so a bare reference is truthy
	  # even for 'false'; compare against 'true' explicitly.
	  if: github.event.inputs.run_deploy_operator == 'true'
	  needs: [changed-files, operator, vllm, sglang, trtllm]
	  env:
	    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
	  outputs:
	    NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - uses: actions/checkout@v4
	    - name: Deploy Operator
	      id: deploy-operator-step
	      env:
	        BRANCH: ${{ github.ref_name }}
	      run: |
	        set -x

	        # Set namespace
	        # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
	        # Replace every "/" with "-", shorten the first "pull-request" to
	        # "pr", and replace every "." with "-".
	        BRANCH_SANITIZED="${BRANCH//\//-}"
	        BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
	        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
	        NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
	        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"

	        # Setup kubeconfig
	        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
	        chmod 600 .kubeconfig
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
	        kubectl config current-context

	        # Create a namespace for this job
	        echo "Creating an ephemeral namespace..."
	        kubectl create namespace $NAMESPACE
	        echo "Attaching the labels for secrets and cleanup"
	        kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

	        # Set the namespace as default
	        kubectl config set-context --current --namespace=$NAMESPACE

	        # Check if Istio is installed
	        kubectl get pods -n istio-system
	        # Check if default storage class exists
	        kubectl get storageclass

	        # Install Helm chart
	        export VIRTUAL_ENV=/opt/dynamo/venv
	        export KUBE_NS=$NAMESPACE
	        export ISTIO_ENABLED=true
	        export ISTIO_GATEWAY=istio-system/ingress-alb
	        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
	        export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}

	        # Install dynamo env secrets
	        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
	        # Create docker pull secret for operator image
	        kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
	        # Install helm dependencies
	        helm repo add bitnami https://charts.bitnami.com/bitnami
	        cd deploy/cloud/helm/platform/
	        helm dep build .
	        # Install platform with namespace restriction for single profile testing
	        helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
	          --set dynamo-operator.namespaceRestriction.enabled=true \
	          --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
	          --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
	          --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
	          --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
	        # Wait for all deployments to be ready
	        timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch

	# Applies each vLLM deployment profile into the namespace created by
	# deploy-operator, waits for the pods, and smoke-tests the frontend through
	# a port-forward. Its step list is anchored as `deploy-test-steps` and is
	# aliased by the sglang and trtllm deploy-test jobs.
	deploy-test-vllm:
	  runs-on: cpu-amd-m5-2xlarge
	  # TODO: Uncomment this when we have a way to test the deploy-test-vllm job in CI.
	  #if: needs.changed-files.outputs.has_code_changes == 'true'
	  # github.event.inputs values are strings, so a bare reference is truthy
	  # even for 'false'; compare against 'true' explicitly.
	  if: github.event.inputs.run_deploy_operator == 'true'
	  needs: [changed-files, deploy-operator, vllm]
	  permissions:
	    contents: read
	  strategy:
	    fail-fast: false
	    max-parallel: 1
	    matrix:
	      profile:
	        - agg
	        - agg_router
	        - disagg
	        - disagg_router
	  name: deploy-test-vllm (${{ matrix.profile }})
	  env:
	    FRAMEWORK: vllm
	    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
	    DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
	    MODEL_NAME: "Qwen/Qwen3-0.6B"
	  steps: &deploy-test-steps
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - uses: actions/checkout@v4
	    - name: Setup Kubeconfig
	      env:
	        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	      run: |
	        set -x
	        # Setup kubeconfig
	        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
	        chmod 600 .kubeconfig
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
	        kubectl config get-contexts
	    - name: Run Tests
	      env:
	        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	      run: |
	        set -x
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE

	        cd examples/backends/$FRAMEWORK
	        export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
	        export KUBE_NS=$NAMESPACE
	        export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
	        # Persisted so the Cleanup step can delete this deployment by name.
	        echo "GRAPH_NAME=${GRAPH_NAME}" >> $GITHUB_ENV
	        # Update the deployment file in-place
	        yq -i '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE

	        # Debug: Show updated deployment file
	        echo "=== UPDATED DEPLOYMENT FILE ==="
	        cat $DEPLOYMENT_FILE

	        # Apply the updated file
	        kubectl apply -n $KUBE_NS -f $DEPLOYMENT_FILE

	        # --- Wait for all pods in the dynamo graph deployment to be ready ---
	        sleep 20
	        # Get the deployment name from the file
	        export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
	        echo "Waiting for all pods with label nvidia.com/dynamo-graph-deployment-name: $GRAPH_NAME"
	        # Wait for all pods with the deployment label to be ready
	        kubectl wait --for=condition=ready pod -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n ${KUBE_NS} --timeout=1800s

	        # Debug: Show final pod statuses for the deployment
	        echo "=== FINAL POD STATUSES ==="
	        kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $KUBE_NS -o wide
	        echo ""

	        kubectl get all -n $KUBE_NS
	        export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} -l nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME} | tail -n1 | awk '{print $1}')
	        export CONTAINER_PORT=$(kubectl get pod $FRONTEND_POD -n ${KUBE_NS} -o jsonpath='{.spec.containers[0].ports[?(@.name=="http")].containerPort}')
	        echo "Container port is ${CONTAINER_PORT}"
	        kubectl port-forward pod/$FRONTEND_POD 8000:${CONTAINER_PORT} -n ${KUBE_NS} &
	        export LLM_URL="http://localhost:8000"
	        sleep 10 # Give port-forward time to establish the connection
	        echo "LLM URL: ${LLM_URL}"
	        echo "MODEL NAME: ${MODEL_NAME}"
	        # Wait until the model is available in the /v1/models response
	        MAX_ATTEMPTS=30
	        ATTEMPT=1
	        while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
	          MODELS_RESPONSE=$(curl -s --retry 5 --retry-delay 2 --retry-connrefused "${LLM_URL}/v1/models")
	          # any(...) is true when ANY listed model matches; a bare
	          # `.data[]?.id == $MODEL_NAME` makes `jq -e` judge only the last entry.
	          if echo "$MODELS_RESPONSE" | jq -e --arg MODEL_NAME "$MODEL_NAME" 'any(.data[]?; .id == $MODEL_NAME)' >/dev/null 2>&1; then
	            echo "Model $MODEL_NAME is available in /v1/models"
	            break
	          fi
	          echo "Waiting for model $MODEL_NAME to be available in /v1/models... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
	          sleep 5
	          ATTEMPT=$((ATTEMPT + 1))
	        done
	        if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
	          echo "Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts"
	          echo "Last response: $MODELS_RESPONSE"
	          exit 1
	        fi
	        RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused -X POST "${LLM_URL}/v1/chat/completions" \
	          -H 'accept: text/event-stream' \
	          -H 'Content-Type: application/json' \
	          -d '{
	            "model": "'"${MODEL_NAME:-Qwen/Qwen3-0.6B}"'",
	            "messages": [
	              {
	                "role": "user",
	                "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
	              }
	            ],
	            "stream":false,
	            "max_tokens": 30,
	            "temperature": 0.0
	          }' 2>&1)
	        echo "Response: $RESPONSE"
	        # Checks run in order; the first failing check sets TEST_RESULT=1.
	        TEST_RESULT=0
	        if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
	          echo "Test failed: Response is not valid JSON"
	          echo "Got: $RESPONSE"
	          TEST_RESULT=1
	        elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
	          echo "Test failed: Message role is not 'assistant'"
	          echo "Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
	          TEST_RESULT=1
	        elif ! echo "$RESPONSE" | jq -e --arg MODEL_NAME "$MODEL_NAME" '.model == $MODEL_NAME' >/dev/null 2>&1; then
	          echo "Test failed: Model name is incorrect"
	          echo "Got: $(echo "$RESPONSE" | jq '.model')"
	          TEST_RESULT=1
	        elif ! echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null 2>&1; then
	          echo "Test failed: Response content length is not greater than 100 characters"
	          echo "Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
	          TEST_RESULT=1
	        else
	          echo "Test passed: Response matches expected format and content"
	        fi
	        exit $TEST_RESULT
	    - name: Cleanup
	      if: always()
	      timeout-minutes: 5
	      env:
	        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	      run: |
	        set -x
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE

	        # For debugging purposes, list all the resources before we delete.
	        # A listing failure must not abort the step before the delete runs.
	        kubectl get all || true

	        echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..."
	        kubectl delete dynamographdeployments ${GRAPH_NAME} -n $NAMESPACE || true

	# SGLang deployment smoke test; reuses the step list anchored as
	# `deploy-test-steps` with FRAMEWORK=sglang and two profiles.
	deploy-test-sglang:
	  runs-on: cpu-amd-m5-2xlarge
	  # TODO: Uncomment this when we have a way to test the deploy-test-sglang job in CI.
	  #if: needs.changed-files.outputs.has_code_changes == 'true'
	  # github.event.inputs values are strings, so a bare reference is truthy
	  # even for 'false'; compare against 'true' explicitly.
	  if: github.event.inputs.run_deploy_operator == 'true'
	  needs: [changed-files, deploy-operator, sglang]
	  permissions:
	    contents: read
	  strategy:
	    fail-fast: false
	    max-parallel: 1
	    matrix:
	      profile:
	        - agg
	        - agg_router
	  name: deploy-test-sglang (${{ matrix.profile }})
	  env:
	    FRAMEWORK: sglang
	    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
	    DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
	    MODEL_NAME: "Qwen/Qwen3-0.6B"
	  steps: *deploy-test-steps

	# TensorRT-LLM deployment smoke test; reuses the step list anchored as
	# `deploy-test-steps` with FRAMEWORK=trtllm and four profiles.
	deploy-test-trtllm:
	  runs-on: cpu-amd-m5-2xlarge
	  # TODO: Uncomment this when we have a way to test the deploy-test-trtllm job in CI.
	  #if: needs.changed-files.outputs.has_code_changes == 'true'
	  # github.event.inputs values are strings, so a bare reference is truthy
	  # even for 'false'; compare against 'true' explicitly.
	  if: github.event.inputs.run_deploy_operator == 'true'
	  needs: [changed-files, deploy-operator, trtllm]
	  permissions:
	    contents: read
	  strategy:
	    fail-fast: false
	    max-parallel: 1
	    matrix:
	      profile:
	        - agg
	        - agg_router
	        - disagg
	        - disagg_router
	  name: deploy-test-trtllm (${{ matrix.profile }})
	  env:
	    FRAMEWORK: trtllm
	    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
	    DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
	    MODEL_NAME: "Qwen/Qwen3-0.6B"
	  steps: *deploy-test-steps

	# Tears down the namespace created by deploy-operator: deletes the
	# DynamoGraphDeployments, uninstalls the Helm release, deletes the namespace.
	cleanup:
	  runs-on: cpu-amd-m5-2xlarge
	  # TODO: Switch to a plain `if: always()` when we have a way to test the cleanup job in CI.
	  # always() lets teardown run even when a deploy-test job failed; the flag
	  # is compared against 'true' because github.event.inputs values are
	  # strings and a bare reference would be truthy for 'false'.
	  if: always() && github.event.inputs.run_deploy_operator == 'true'
	  needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
	  steps:
	    - name: Output Node Name
	      shell: bash
	      run: |
	        echo ${K8S_NODE_NAME}
	    - uses: actions/checkout@v4
	    - name: Setup Kubeconfig
	      env:
	        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	      run: |
	        set -x
	        # Setup kubeconfig
	        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
	        chmod 600 .kubeconfig
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
	        kubectl config current-context
	    - name: Cleanup
	      timeout-minutes: 5
	      env:
	        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
	      run: |
	        set -x
	        # deploy-operator may not have produced a namespace (skipped or
	        # failed early); there is nothing to remove in that case.
	        if [ -z "${NAMESPACE}" ]; then
	          echo "deploy-operator reported no namespace; nothing to clean up."
	          exit 0
	        fi

	        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
	        chmod 600 .kubeconfig
	        export KUBECONFIG=$(pwd)/.kubeconfig
	        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"

	        # For debugging purposes, list all the resources before we uninstall.
	        # Listing failures must not abort the step before the deletions run.
	        kubectl get all || true

	        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
	        kubectl delete dynamographdeployments --all -n $NAMESPACE || true

	        # Uninstall the helm chart
	        helm ls || true
	        helm uninstall dynamo-platform --namespace $NAMESPACE || true

	        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
	        kubectl delete namespace $NAMESPACE || true
	        echo "Namespace $NAMESPACE completed."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Merge branch 'main' into fix/sglang-multimodal-worker-registration #7327

Workflow file

Merge branch 'main' into fix/sglang-multimodal-worker-registration #7327

Uh oh!

Jobs

Run details

Workflow file for this run