Commit dbdf392

Merge remote-tracking branch 'origin' into mm_docs
2 parents: 73ed7ae + 1e37c10

55 files changed (+3067 additions, -842 deletions). Only a subset of the changed files is shown below.

.github/actions/docker-build/action.yml

Lines changed: 20 additions & 0 deletions
@@ -151,6 +151,26 @@ runs:
       # Exit with the build's exit code
       exit ${BUILD_EXIT_CODE}

+  - name: Run Sanity Check on Runtime Image
+    if: inputs.target == 'runtime'
+    shell: bash
+    run: |
+      IMAGE_TAG="${{ steps.build.outputs.image_tag }}"
+      echo "Running sanity check on image: $IMAGE_TAG"
+
+      # Run the sanity check script inside the container
+      # The script is located in /workspace/deploy/sanity_check.py in runtime containers
+      set +e
+      docker run --rm "$IMAGE_TAG" python /workspace/deploy/sanity_check.py --runtime-check --no-gpu-check
+      SANITY_CHECK_EXIT_CODE=$?
+      set -e
+      if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
+        echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
+        exit ${SANITY_CHECK_EXIT_CODE}
+      else
+        echo "✅ Sanity check passed"
+      fi
+
   - name: Capture Build Metrics
     id: metrics
     shell: bash
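The new step invokes deploy/sanity_check.py inside the freshly built runtime image, but the script itself is not part of this diff. A minimal sketch of what such a runtime check could look like follows; the module names and argument handling here are illustrative assumptions, not the actual implementation:

# Hypothetical sketch; the real /workspace/deploy/sanity_check.py is not shown in this commit.
import argparse
import importlib
import sys

def main() -> int:
    parser = argparse.ArgumentParser(description="Verify the runtime image is usable.")
    parser.add_argument("--runtime-check", action="store_true",
                        help="Verify that the ai-dynamo packages import cleanly.")
    parser.add_argument("--no-gpu-check", action="store_true",
                        help="Skip GPU probing (CI build runners may not expose GPUs).")
    args = parser.parse_args()

    if args.runtime_check:
        # Assumed module names; the installed ai-dynamo wheels may expose different paths.
        for module in ("dynamo.runtime", "dynamo.planner"):
            try:
                importlib.import_module(module)
            except ImportError as exc:
                print(f"ERROR: failed to import {module}: {exc}")
                return 1

    if not args.no_gpu_check:
        pass  # GPU probing would go here; the CI step skips it via --no-gpu-check.

    print("Sanity check passed")
    return 0

if __name__ == "__main__":
    sys.exit(main())

Because the check runs under set +e and its exit code is captured, a non-zero return from the script fails the build step without aborting the surrounding shell.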

.github/actions/pytest/action.yml

Lines changed: 20 additions & 35 deletions
@@ -77,23 +77,31 @@ runs:
        fi
      fi

-      # Run without --rm so we can copy results even if container crashes (example SIGSEGV exit 139)
-      docker run ${GPU_FLAGS} -w /workspace \
+      # Get absolute path for test-results directory and ensure it has proper permissions
+      TEST_RESULTS_DIR="$(pwd)/test-results"
+      chmod 777 "${TEST_RESULTS_DIR}"
+      echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
+
+      docker run ${GPU_FLAGS} --rm -w /workspace \
        --cpus=${NUM_CPUS} \
        --network host \
        --name ${{ env.CONTAINER_ID }}_pytest \
+       -v "${TEST_RESULTS_DIR}:/workspace/test-results" \
        ${{ inputs.image_tag }} \
-       bash -c "mkdir -p /workspace/test-results && ${PYTEST_CMD}"
+       bash -c "${PYTEST_CMD}"

      TEST_EXIT_CODE=$?
      echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
      echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"

-      # Copy test results from container to host
-      docker cp ${{ env.CONTAINER_ID }}_pytest:/workspace/test-results . || echo "Failed to copy test results"
-
-      # Clean up container
-      docker rm -f ${{ env.CONTAINER_ID }}_pytest || echo "Failed to clean up container"
+      # Verify test results were written (only in normal mode)
+      if [[ "${{ inputs.dry_run }}" != "true" ]]; then
+        if [[ -f "${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}" ]]; then
+          echo "✅ Test results file found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
+        else
+          echo "⚠️ Test results file not found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
+        fi
+      fi

      # Always continue to results processing
      exit 0
@@ -107,13 +115,6 @@ runs:
      STR_TEST_TYPE=$(echo "${{ inputs.test_type }}" | tr ', ' '_')
      echo "STR_TEST_TYPE=${STR_TEST_TYPE}" >> $GITHUB_ENV

-      # Skip XML processing if in dry-run mode
-      if [[ "${{ inputs.dry_run }}" == "true" ]]; then
-        echo "✅ Dry-run mode: Test collection completed"
-        echo "⏭️ No JUnit XML generated (dry-run mode)"
-        exit 0
-      fi
-
      # Check for JUnit XML file and determine test status
      JUNIT_FILE="test-results/pytest_test_report.xml"

@@ -125,23 +126,9 @@ runs:
      ERROR_TESTS=$(grep -o 'errors="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
      echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)"

-      # Create uniquely named metadata file with step context information
-      # Use framework-testtype-arch to make it unique per test run
-      METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.json"
-      JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.xml"
-
      # Rename XML file to unique name
+      JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml"
      mv "$JUNIT_FILE" "test-results/$JUNIT_NAME"
-
-      echo '{' > "$METADATA_FILE"
-      echo ' "job_name": "${{ github.job }}",' >> "$METADATA_FILE"
-      echo ' "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE"
-      echo ' "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE"
-      echo ' "platform_arch": "${{ inputs.platform_arch }}",' >> "$METADATA_FILE"
-      echo ' "junit_xml_file": "'"$JUNIT_NAME"'",' >> "$METADATA_FILE"
-      echo ' "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE"
-      echo '}' >> "$METADATA_FILE"
-      echo "📝 Created test metadata file: $METADATA_FILE"
      echo "📝 Renamed XML file to: $JUNIT_NAME"
    else
      echo "⚠️ JUnit XML file not found - test results may not be available for upload"
@@ -155,10 +142,8 @@ runs:

  - name: Upload Test Results
    uses: actions/upload-artifact@v4
-    if: always() && inputs.dry_run != 'true' # Skip upload in dry-run mode
+    if: always() # Always upload test results, even if tests failed
    with:
-      name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
-      path: |
-        test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
-        test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
+      name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
+      path: test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
    retention-days: 7
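Mounting TEST_RESULTS_DIR into the container means the JUnit XML lands directly on the host, so the step can use --rm again instead of keeping the container around for docker cp. The results-processing step then reads test counts out of that XML with grep on the testsuite attributes; the following purely illustrative Python helper (not part of this commit, file name hypothetical) shows what those grep patterns extract:

# summarize_junit.py - illustrative equivalent of the grep-based count extraction.
import sys
import xml.etree.ElementTree as ET

def summarize(junit_path: str) -> tuple[int, int, int]:
    # pytest's --junitxml output wraps a single <testsuite> in a <testsuites> root.
    root = ET.parse(junit_path).getroot()
    suite = root if root.tag == "testsuite" else root.find("testsuite")
    total = int(suite.get("tests", "0"))
    failed = int(suite.get("failures", "0"))
    errors = int(suite.get("errors", "0"))
    return total, failed, errors

if __name__ == "__main__":
    total, failed, errors = summarize(sys.argv[1])
    print(f"{total} tests completed ({failed} failed, {errors} errors)")

Usage would be, for example: python summarize_junit.py test-results/pytest_test_report.xml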

.github/workflows/container-validation-backends.yml

Lines changed: 71 additions & 52 deletions
@@ -76,6 +76,9 @@ jobs:
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Linter
        shell: bash
        env:
@@ -416,6 +419,7 @@ jobs:
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace=$NAMESPACE
      - name: Run Fault Tolerance Tests
+        id: run-ft-tests
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
@@ -437,14 +441,49 @@ jobs:
          pip install -r container/deps/requirements.test.txt
          pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic

-          # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
+          # Create test-results directory
+          mkdir -p test-results
+
+          # Run the pytest command with JUnit XML output
+          set +e # Don't exit on test failures
          pytest tests/fault_tolerance/deploy/test_deployment.py \
            -m 'k8s and fault_tolerance' \
            -k '${{ matrix.framework.test_scenario }}' \
            -s -v \
            --namespace ${NAMESPACE} \
            --image ${IMAGE} \
-            --client-type legacy
+            --client-type legacy \
+            --junitxml=test-results/pytest_ft_report.xml \
+            --tb=short
+
+          TEST_EXIT_CODE=$?
+          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
+          echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
+
+          exit ${TEST_EXIT_CODE}
+        continue-on-error: true
+
+      - name: Process Fault Tolerance Test Results
+        if: always()
+        run: |
+          set -x
+
+          # Rename JUnit XML with unique naming if it exists
+          if [ -f "test-results/pytest_ft_report.xml" ]; then
+            mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
+            echo "✅ JUnit XML report renamed with unique identifier"
+          else
+            echo "⚠️ JUnit XML report not found"
+          fi
+
+      - name: Upload Fault Tolerance Test Results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
+          path: test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
+          retention-days: 7
+
      - name: Cleanup
        if: always()
        timeout-minutes: 5
@@ -468,56 +507,6 @@ jobs:
          kubectl delete namespace $NAMESPACE || true
          echo "Namespace $NAMESPACE completed."

-  # Upload metrics for this workflow and all its jobs
-  upload-workflow-metrics:
-    name: Upload Workflow Metrics
-    runs-on: gitlab
-    if: always() # Always run, even if other jobs fail
-    needs: [backend-status-check] # Wait for the status check which waits for all build jobs
-
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.x'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests
-
-      - name: Download build metrics
-        uses: actions/download-artifact@v4
-        with:
-          pattern: build-metrics-*
-          path: build-metrics/
-          merge-multiple: true
-        continue-on-error: true # Don't fail if artifacts don't exist
-
-      - name: Download test results
-        uses: actions/download-artifact@v4
-        with:
-          pattern: test-results-*
-          path: test-results/
-          merge-multiple: true
-        continue-on-error: true # Don't fail if artifacts don't exist
-
-      - name: Upload Complete Workflow Metrics
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          WORKFLOW_INDEX: ${{ secrets.WORKFLOW_INDEX }}
-          JOB_INDEX: ${{ secrets.JOB_INDEX }}
-          STEPS_INDEX: ${{ secrets.STEPS_INDEX }}
-          # Container and test index configuration
-          CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
-          TEST_INDEX: ${{ secrets.TEST_INDEX }}
-        run: |
-          # Upload complete workflow metrics including container metrics
-          python3 .github/workflows/upload_complete_workflow_metrics.py
-
  deploy-operator:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment this when we have a way to test the deploy-operator job in CI.
@@ -637,13 +626,17 @@ jobs:
          kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
          kubectl config get-contexts
      - name: Run Tests
+        id: run-tests
        env:
          NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace=$NAMESPACE

+          # Redirect all output to a log file while still showing it
+          exec > >(tee -a test-output.log) 2>&1
+
          cd examples/backends/$FRAMEWORK
          export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
          export KUBE_NS=$NAMESPACE
@@ -736,6 +729,32 @@ jobs:
            echo "Test passed: Response matches expected format and content"
          fi
          exit $TEST_RESULT
+        continue-on-error: true
+
+      - name: Process Deployment Test Results
+        if: always()
+        run: |
+          set -x
+
+          # Create test-results directory
+          mkdir -p test-results
+
+          # Copy and rename the test output log with unique naming
+          if [ -f "test-output.log" ]; then
+            cp test-output.log "test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log"
+            echo "✅ Test output log copied to test-results/"
+          else
+            echo "⚠️ test-output.log not found"
+          fi
+
+      - name: Upload Deployment Test Results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-${{ env.FRAMEWORK }}-deploy-${{ matrix.profile }}-amd64-${{ github.run_id }}-${{ job.check_run_id }}
+          path: test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log
+          retention-days: 7
+
      - name: Cleanup
        if: always()
        timeout-minutes: 5
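The fault-tolerance step selects tests with -m 'k8s and fault_tolerance' plus a -k scenario filter, and now writes a JUnit report via --junitxml that later steps rename with the run and check-run IDs so parallel matrix uploads cannot collide. As a purely illustrative sketch (the real tests/fault_tolerance/deploy/test_deployment.py is not shown in this diff, and the test name below is hypothetical), a test picked up by that marker expression would look roughly like:

import pytest

@pytest.mark.k8s
@pytest.mark.fault_tolerance
def test_deployment_recovers_from_pod_restart():
    # A real test would use the --namespace/--image/--client-type options passed
    # on the command line to deploy the graph, inject a fault, and assert recovery.
    assert True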

components/src/dynamo/planner/utils/planner_core.py

Lines changed: 12 additions & 1 deletion
@@ -24,7 +24,7 @@
     PrefillInterpolator,
 )
 from dynamo.planner.utils.pre_swept_results_utils import PreSweptResultsHelper
-from dynamo.planner.utils.prometheus import PrometheusAPIClient
+from dynamo.planner.utils.prometheus import MetricSource, PrometheusAPIClient
 from dynamo.planner.utils.trace_data_extractor import extract_metrics_from_mooncake
 from dynamo.runtime import DistributedRuntime
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -150,9 +150,20 @@ def __init__(
         else:
             raise ValueError(f"Invalid environment: {args.environment}")

+        # Use backend metrics for vLLM (queries vllm:* metrics directly from workers)
+        # Use frontend metrics for other backends (queries dynamo_frontend_* metrics)
+        metric_source = (
+            MetricSource.VLLM
+            if args.backend.lower() == "vllm"
+            else MetricSource.FRONTEND
+        )
+        logger.info(
+            f"Initializing Prometheus client with metric_source='{metric_source}' for backend '{args.backend}'"
+        )
         self.prometheus_api_client = PrometheusAPIClient(
             args.metric_pulling_prometheus_endpoint,
             args.namespace,
+            metric_source=metric_source,
         )

         self.num_req_predictor = LOAD_PREDICTORS[args.load_predictor](
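MetricSource is imported from dynamo.planner.utils.prometheus, whose definition is not part of this diff. A hedged sketch of what that enum and the backend-based selection might look like — the member values are assumptions; only the vllm:* versus dynamo_frontend_* distinction comes from the comments above:

from enum import Enum

class MetricSource(str, Enum):
    VLLM = "vllm"          # query vllm:* series exported by the workers themselves
    FRONTEND = "frontend"  # query dynamo_frontend_* series exported by the frontend

def select_metric_source(backend: str) -> MetricSource:
    # Mirrors the selection logic added to planner_core.py above.
    return MetricSource.VLLM if backend.lower() == "vllm" else MetricSource.FRONTEND

assert select_metric_source("vLLM") is MetricSource.VLLM
assert select_metric_source("sglang") is MetricSource.FRONTEND

The chosen source is then passed to PrometheusAPIClient through the new metric_source keyword argument, so only vLLM deployments query worker-side metrics.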
