From 6621766a8653382a35a7219cc6adcb4db89a5cb8 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Mon, 8 Dec 2025 15:25:41 -0800 Subject: [PATCH 1/3] test: Auto pre-download model before Cancellation and Migration test begin Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com> --- tests/conftest.py | 21 +++++++++++++++++++ .../cancellation/test_sglang.py | 8 ++----- .../cancellation/test_trtllm.py | 16 ++++---------- .../fault_tolerance/cancellation/test_vllm.py | 8 +++---- .../fault_tolerance/migration/test_sglang.py | 8 +++---- .../fault_tolerance/migration/test_trtllm.py | 8 +++---- tests/fault_tolerance/migration/test_vllm.py | 8 +++---- 7 files changed, 42 insertions(+), 35 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e4a4b562a6..43570b6e53 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ import os import shutil import tempfile +import time from pathlib import Path from typing import Optional @@ -226,6 +227,26 @@ def pytest_collection_modifyitems(config, items): config.models_to_download = models_to_download +def pytest_runtestloop(session): + """Download models after collection but before any tests run. + + This hook runs after pytest_collection_modifyitems (so models are collected) + but before any test execution, ensuring model downloads don't count against test timeouts. + """ + models = getattr(session.config, "models_to_download", None) + + if models: + logging.info( + f"Downloading {len(models)} models before test execution\nModels: {models}" + ) + start_time = time.time() + + download_models(model_list=list(models)) + + download_duration = time.time() - start_time + logging.info(f"Model download completed in {download_duration:.1f}s") + + class EtcdServer(ManagedProcess): def __init__(self, request, port=2379, timeout=300): port_string = str(port) diff --git a/tests/fault_tolerance/cancellation/test_sglang.py b/tests/fault_tolerance/cancellation/test_sglang.py index 161f7e72d2..55058f01d1 100644 --- a/tests/fault_tolerance/cancellation/test_sglang.py +++ b/tests/fault_tolerance/cancellation/test_sglang.py @@ -161,9 +161,7 @@ def is_ready(self, response) -> bool: @pytest.mark.timeout(160) # 3x average @pytest.mark.gpu_1 @pytest.mark.xfail(strict=False) -def test_request_cancellation_sglang_aggregated( - request, runtime_services, predownload_models -): +def test_request_cancellation_sglang_aggregated(request, runtime_services): """ End-to-end test for request cancellation functionality in aggregated mode. @@ -247,9 +245,7 @@ def test_request_cancellation_sglang_aggregated( @pytest.mark.timeout(185) # 3x average @pytest.mark.gpu_2 -def test_request_cancellation_sglang_decode_cancel( - request, runtime_services, predownload_models -): +def test_request_cancellation_sglang_decode_cancel(request, runtime_services): """ End-to-end test for request cancellation during decode phase. diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py index 168e543554..87e679f40d 100644 --- a/tests/fault_tolerance/cancellation/test_trtllm.py +++ b/tests/fault_tolerance/cancellation/test_trtllm.py @@ -141,9 +141,7 @@ def is_ready(self, response) -> bool: @pytest.mark.timeout(140) # 3x average -def test_request_cancellation_trtllm_aggregated( - request, runtime_services, predownload_models -): +def test_request_cancellation_trtllm_aggregated(request, runtime_services): """ End-to-end test for request cancellation functionality in aggregated mode. @@ -215,9 +213,7 @@ def test_request_cancellation_trtllm_aggregated( @pytest.mark.timeout(350) # 3x average -def test_request_cancellation_trtllm_decode_cancel( - request, runtime_services, predownload_models -): +def test_request_cancellation_trtllm_decode_cancel(request, runtime_services): """ End-to-end test for request cancellation during decode phase with unified frontend. @@ -288,9 +284,7 @@ def test_request_cancellation_trtllm_decode_cancel( @pytest.mark.timeout(350) # 3x average -def test_request_cancellation_trtllm_prefill_cancel( - request, runtime_services, predownload_models -): +def test_request_cancellation_trtllm_prefill_cancel(request, runtime_services): """ End-to-end test for request cancellation during prefill phase with unified frontend. @@ -375,9 +369,7 @@ def test_request_cancellation_trtllm_prefill_cancel( reason="May fail due to unknown reason with TRT-LLM or backend implementation", strict=False, ) -def test_request_cancellation_trtllm_kv_transfer_cancel( - request, runtime_services, predownload_models -): +def test_request_cancellation_trtllm_kv_transfer_cancel(request, runtime_services): """ End-to-end test for request cancellation during prefill to decode KV transfer phase. diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py index bd1fa4eef5..01bde0f314 100644 --- a/tests/fault_tolerance/cancellation/test_vllm.py +++ b/tests/fault_tolerance/cancellation/test_vllm.py @@ -134,9 +134,7 @@ def is_ready(self, response) -> bool: @pytest.mark.timeout(110) # 3x average -def test_request_cancellation_vllm_aggregated( - request, runtime_services, predownload_models -): +def test_request_cancellation_vllm_aggregated(request, runtime_services): """ End-to-end test for request cancellation functionality in aggregated mode. @@ -209,7 +207,7 @@ def test_request_cancellation_vllm_aggregated( @pytest.mark.timeout(150) # 3x average def test_request_cancellation_vllm_decode_cancel( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for request cancellation during decode phase. @@ -279,7 +277,7 @@ def test_request_cancellation_vllm_decode_cancel( @pytest.mark.timeout(150) # 3x average def test_request_cancellation_vllm_prefill_cancel( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for request cancellation during prefill phase. diff --git a/tests/fault_tolerance/migration/test_sglang.py b/tests/fault_tolerance/migration/test_sglang.py index 9a6298fb25..e41da95b59 100644 --- a/tests/fault_tolerance/migration/test_sglang.py +++ b/tests/fault_tolerance/migration/test_sglang.py @@ -115,7 +115,7 @@ def is_ready(self, response) -> bool: @pytest.mark.timeout(235) # 3x average def test_request_migration_sglang_worker_failure( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with migration support using SGLang. @@ -159,7 +159,7 @@ def test_request_migration_sglang_worker_failure( @pytest.mark.skip(reason="SGLang graceful shutdown not yet implemented") def test_request_migration_sglang_graceful_shutdown( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with graceful shutdown and migration support using SGLang. @@ -207,7 +207,7 @@ def test_request_migration_sglang_graceful_shutdown( @pytest.mark.timeout(135) # 3x average def test_no_request_migration_sglang_worker_failure( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with migration disabled using SGLang. @@ -267,7 +267,7 @@ def test_no_request_migration_sglang_worker_failure( @pytest.mark.skip(reason="SGLang graceful shutdown not yet implemented") def test_no_request_migration_sglang_graceful_shutdown( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with graceful shutdown and migration disabled using SGLang. diff --git a/tests/fault_tolerance/migration/test_trtllm.py b/tests/fault_tolerance/migration/test_trtllm.py index 180188415b..0b0da3a8b1 100644 --- a/tests/fault_tolerance/migration/test_trtllm.py +++ b/tests/fault_tolerance/migration/test_trtllm.py @@ -111,7 +111,7 @@ def is_ready(self, response) -> bool: @pytest.mark.timeout(290) # 3x average def test_request_migration_trtllm_worker_failure( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with migration support using TRT-LLM. @@ -155,7 +155,7 @@ def test_request_migration_trtllm_worker_failure( @pytest.mark.skip(reason="TRT-LLM graceful shutdown not yet implemented") def test_request_migration_trtllm_graceful_shutdown( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with graceful shutdown and migration support using TRT-LLM. @@ -203,7 +203,7 @@ def test_request_migration_trtllm_graceful_shutdown( @pytest.mark.timeout(185) # 3x average def test_no_request_migration_trtllm_worker_failure( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with migration disabled using TRT-LLM. @@ -263,7 +263,7 @@ def test_no_request_migration_trtllm_worker_failure( @pytest.mark.skip(reason="TRT-LLM graceful shutdown not yet implemented") def test_no_request_migration_trtllm_graceful_shutdown( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with graceful shutdown and migration disabled using TRT-LLM. diff --git a/tests/fault_tolerance/migration/test_vllm.py b/tests/fault_tolerance/migration/test_vllm.py index 5aa06a8d61..336cfb041b 100644 --- a/tests/fault_tolerance/migration/test_vllm.py +++ b/tests/fault_tolerance/migration/test_vllm.py @@ -115,7 +115,7 @@ def is_ready(self, response) -> bool: @pytest.mark.timeout(290) # 3x average def test_request_migration_vllm_worker_failure( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with migration support. @@ -159,7 +159,7 @@ def test_request_migration_vllm_worker_failure( @pytest.mark.timeout(280) # 3x average def test_request_migration_vllm_graceful_shutdown( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with graceful shutdown and migration support. @@ -207,7 +207,7 @@ def test_request_migration_vllm_graceful_shutdown( @pytest.mark.timeout(150) # 3x average def test_no_request_migration_vllm_worker_failure( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with migration disabled. @@ -267,7 +267,7 @@ def test_no_request_migration_vllm_worker_failure( @pytest.mark.timeout(140) # 3x average def test_no_request_migration_vllm_graceful_shutdown( - request, runtime_services, predownload_models, set_ucx_tls_no_mm + request, runtime_services, set_ucx_tls_no_mm ): """ End-to-end test for worker fault tolerance with graceful shutdown and migration disabled. From 6924769437600597b578e8062db8c11280bf76b8 Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Mon, 8 Dec 2025 15:38:58 -0800 Subject: [PATCH 2/3] tmp: Run all Cancellation / Migration tests pre_merge Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com> --- tests/fault_tolerance/cancellation/test_sglang.py | 2 +- tests/fault_tolerance/cancellation/test_trtllm.py | 2 +- tests/fault_tolerance/cancellation/test_vllm.py | 2 +- tests/fault_tolerance/migration/test_sglang.py | 2 +- tests/fault_tolerance/migration/test_trtllm.py | 2 +- tests/fault_tolerance/migration/test_vllm.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/fault_tolerance/cancellation/test_sglang.py b/tests/fault_tolerance/cancellation/test_sglang.py index 55058f01d1..cb761cebf7 100644 --- a/tests/fault_tolerance/cancellation/test_sglang.py +++ b/tests/fault_tolerance/cancellation/test_sglang.py @@ -25,7 +25,7 @@ pytest.mark.sglang, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.post_merge, # post_merge to pinpoint failure commit + pytest.mark.pre_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py index 87e679f40d..3640a93c75 100644 --- a/tests/fault_tolerance/cancellation/test_trtllm.py +++ b/tests/fault_tolerance/cancellation/test_trtllm.py @@ -26,7 +26,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.post_merge, # post_merge to pinpoint failure commit + pytest.mark.pre_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py index 01bde0f314..802f5d25d4 100644 --- a/tests/fault_tolerance/cancellation/test_vllm.py +++ b/tests/fault_tolerance/cancellation/test_vllm.py @@ -25,7 +25,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.post_merge, # post_merge to pinpoint failure commit + pytest.mark.pre_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/migration/test_sglang.py b/tests/fault_tolerance/migration/test_sglang.py index e41da95b59..dd1c29ae67 100644 --- a/tests/fault_tolerance/migration/test_sglang.py +++ b/tests/fault_tolerance/migration/test_sglang.py @@ -28,7 +28,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.post_merge, # post_merge to pinpoint failure commit + pytest.mark.pre_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/migration/test_trtllm.py b/tests/fault_tolerance/migration/test_trtllm.py index 0b0da3a8b1..f3878cbeef 100644 --- a/tests/fault_tolerance/migration/test_trtllm.py +++ b/tests/fault_tolerance/migration/test_trtllm.py @@ -28,7 +28,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.post_merge, # post_merge to pinpoint failure commit + pytest.mark.pre_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/migration/test_vllm.py b/tests/fault_tolerance/migration/test_vllm.py index 336cfb041b..ff525d5ae4 100644 --- a/tests/fault_tolerance/migration/test_vllm.py +++ b/tests/fault_tolerance/migration/test_vllm.py @@ -28,7 +28,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.post_merge, # post_merge to pinpoint failure commit + pytest.mark.pre_merge, # post_merge to pinpoint failure commit ] From 6144d78404f91c012cda7dacda0f4837ba41eeeb Mon Sep 17 00:00:00 2001 From: Jacky <18255193+kthui@users.noreply.github.com> Date: Mon, 8 Dec 2025 16:38:09 -0800 Subject: [PATCH 3/3] Revert "tmp: Run all Cancellation / Migration tests pre_merge" This reverts commit 6924769437600597b578e8062db8c11280bf76b8. Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com> --- tests/fault_tolerance/cancellation/test_sglang.py | 2 +- tests/fault_tolerance/cancellation/test_trtllm.py | 2 +- tests/fault_tolerance/cancellation/test_vllm.py | 2 +- tests/fault_tolerance/migration/test_sglang.py | 2 +- tests/fault_tolerance/migration/test_trtllm.py | 2 +- tests/fault_tolerance/migration/test_vllm.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/fault_tolerance/cancellation/test_sglang.py b/tests/fault_tolerance/cancellation/test_sglang.py index cb761cebf7..55058f01d1 100644 --- a/tests/fault_tolerance/cancellation/test_sglang.py +++ b/tests/fault_tolerance/cancellation/test_sglang.py @@ -25,7 +25,7 @@ pytest.mark.sglang, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.pre_merge, # post_merge to pinpoint failure commit + pytest.mark.post_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py index 3640a93c75..87e679f40d 100644 --- a/tests/fault_tolerance/cancellation/test_trtllm.py +++ b/tests/fault_tolerance/cancellation/test_trtllm.py @@ -26,7 +26,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.pre_merge, # post_merge to pinpoint failure commit + pytest.mark.post_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py index 802f5d25d4..01bde0f314 100644 --- a/tests/fault_tolerance/cancellation/test_vllm.py +++ b/tests/fault_tolerance/cancellation/test_vllm.py @@ -25,7 +25,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.pre_merge, # post_merge to pinpoint failure commit + pytest.mark.post_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/migration/test_sglang.py b/tests/fault_tolerance/migration/test_sglang.py index dd1c29ae67..e41da95b59 100644 --- a/tests/fault_tolerance/migration/test_sglang.py +++ b/tests/fault_tolerance/migration/test_sglang.py @@ -28,7 +28,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.pre_merge, # post_merge to pinpoint failure commit + pytest.mark.post_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/migration/test_trtllm.py b/tests/fault_tolerance/migration/test_trtllm.py index f3878cbeef..0b0da3a8b1 100644 --- a/tests/fault_tolerance/migration/test_trtllm.py +++ b/tests/fault_tolerance/migration/test_trtllm.py @@ -28,7 +28,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.pre_merge, # post_merge to pinpoint failure commit + pytest.mark.post_merge, # post_merge to pinpoint failure commit ] diff --git a/tests/fault_tolerance/migration/test_vllm.py b/tests/fault_tolerance/migration/test_vllm.py index ff525d5ae4..336cfb041b 100644 --- a/tests/fault_tolerance/migration/test_vllm.py +++ b/tests/fault_tolerance/migration/test_vllm.py @@ -28,7 +28,7 @@ pytest.mark.gpu_1, pytest.mark.e2e, pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), - pytest.mark.pre_merge, # post_merge to pinpoint failure commit + pytest.mark.post_merge, # post_merge to pinpoint failure commit ]