ai-dynamo · kthui · Dec 9, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 9, 2025
@@ -17,6 +17,7 @@
 import os
 import shutil
 import tempfile
+import time
 from pathlib import Path
 from typing import Optional
 
@@ -226,6 +227,26 @@ def pytest_collection_modifyitems(config, items):
         config.models_to_download = models_to_download
 
 
+def pytest_runtestloop(session):
+    """Download the models required by the collected tests before any test runs.
+
+    The set of models is read from ``session.config.models_to_download``,
+    which ``pytest_collection_modifyitems`` stores on the config. Nothing is
+    downloaded when that attribute is missing or empty. Downloading here,
+    ahead of test execution, keeps model downloads from counting against
+    test timeouts.
+
+    Args:
+        session: The pytest session; its ``config`` may carry
+            ``models_to_download``.
+
+    Returns:
+        Always ``None``, so pytest goes on to the remaining
+        ``pytest_runtestloop`` implementations and runs the tests.
+    """
+    models = getattr(session.config, "models_to_download", None)
+    if not models:
+        return
+
+    # pytest invokes this hook even for `--collect-only`; no test executes in
+    # that mode, so downloading models would be wasted work.
+    if getattr(session.config.option, "collectonly", False):
+        return
+
+    logging.info(
+        f"Downloading {len(models)} models before test execution\nModels: {models}"
+    )
+    # Monotonic clock: the measured duration cannot be skewed by wall-clock
+    # adjustments that happen while the download is in progress.
+    start_time = time.monotonic()
+
+    download_models(model_list=list(models))
+
+    download_duration = time.monotonic() - start_time
+    logging.info(f"Model download completed in {download_duration:.1f}s")
+
+
 class EtcdServer(ManagedProcess):
     def __init__(self, request, port=2379, timeout=300):
         port_string = str(port)

@@ -161,9 +161,7 @@ def is_ready(self, response) -> bool:
 @pytest.mark.timeout(160)  # 3x average
 @pytest.mark.gpu_1
 @pytest.mark.xfail(strict=False)
-def test_request_cancellation_sglang_aggregated(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_sglang_aggregated(request, runtime_services):
     """
     End-to-end test for request cancellation functionality in aggregated mode.
 
@@ -247,9 +245,7 @@ def test_request_cancellation_sglang_aggregated(
 
 @pytest.mark.timeout(185)  # 3x average
 @pytest.mark.gpu_2
-def test_request_cancellation_sglang_decode_cancel(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_sglang_decode_cancel(request, runtime_services):
     """
     End-to-end test for request cancellation during decode phase.
 

@@ -141,9 +141,7 @@ def is_ready(self, response) -> bool:
 
 
 @pytest.mark.timeout(140)  # 3x average
-def test_request_cancellation_trtllm_aggregated(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_trtllm_aggregated(request, runtime_services):
     """
     End-to-end test for request cancellation functionality in aggregated mode.
 
@@ -215,9 +213,7 @@ def test_request_cancellation_trtllm_aggregated(
 
 
 @pytest.mark.timeout(350)  # 3x average
-def test_request_cancellation_trtllm_decode_cancel(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_trtllm_decode_cancel(request, runtime_services):
     """
     End-to-end test for request cancellation during decode phase with unified frontend.
 
@@ -288,9 +284,7 @@ def test_request_cancellation_trtllm_decode_cancel(
 
 
 @pytest.mark.timeout(350)  # 3x average
-def test_request_cancellation_trtllm_prefill_cancel(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_trtllm_prefill_cancel(request, runtime_services):
     """
     End-to-end test for request cancellation during prefill phase with unified frontend.
 
@@ -375,9 +369,7 @@ def test_request_cancellation_trtllm_prefill_cancel(
     reason="May fail due to unknown reason with TRT-LLM or backend implementation",
     strict=False,
 )
-def test_request_cancellation_trtllm_kv_transfer_cancel(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_trtllm_kv_transfer_cancel(request, runtime_services):
     """
     End-to-end test for request cancellation during prefill to decode KV transfer phase.
 

@@ -134,9 +134,7 @@ def is_ready(self, response) -> bool:
 
 
 @pytest.mark.timeout(110)  # 3x average
-def test_request_cancellation_vllm_aggregated(
-    request, runtime_services, predownload_models
-):
+def test_request_cancellation_vllm_aggregated(request, runtime_services):
     """
     End-to-end test for request cancellation functionality in aggregated mode.
 
@@ -209,7 +207,7 @@ def test_request_cancellation_vllm_aggregated(
 
 @pytest.mark.timeout(150)  # 3x average
 def test_request_cancellation_vllm_decode_cancel(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during decode phase.
@@ -279,7 +277,7 @@ def test_request_cancellation_vllm_decode_cancel(
 
 @pytest.mark.timeout(150)  # 3x average
 def test_request_cancellation_vllm_prefill_cancel(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during prefill phase.

@@ -115,7 +115,7 @@ def is_ready(self, response) -> bool:
 
 @pytest.mark.timeout(235)  # 3x average
 def test_request_migration_sglang_worker_failure(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration support using SGLang.
@@ -159,7 +159,7 @@ def test_request_migration_sglang_worker_failure(
 
 @pytest.mark.skip(reason="SGLang graceful shutdown not yet implemented")
 def test_request_migration_sglang_graceful_shutdown(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration support using SGLang.
@@ -207,7 +207,7 @@ def test_request_migration_sglang_graceful_shutdown(
 
 @pytest.mark.timeout(135)  # 3x average
 def test_no_request_migration_sglang_worker_failure(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration disabled using SGLang.
@@ -267,7 +267,7 @@ def test_no_request_migration_sglang_worker_failure(
 
 @pytest.mark.skip(reason="SGLang graceful shutdown not yet implemented")
 def test_no_request_migration_sglang_graceful_shutdown(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration disabled using SGLang.

@@ -111,7 +111,7 @@ def is_ready(self, response) -> bool:
 
 @pytest.mark.timeout(290)  # 3x average
 def test_request_migration_trtllm_worker_failure(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration support using TRT-LLM.
@@ -155,7 +155,7 @@ def test_request_migration_trtllm_worker_failure(
 
 @pytest.mark.skip(reason="TRT-LLM graceful shutdown not yet implemented")
 def test_request_migration_trtllm_graceful_shutdown(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration support using TRT-LLM.
@@ -203,7 +203,7 @@ def test_request_migration_trtllm_graceful_shutdown(
 
 @pytest.mark.timeout(185)  # 3x average
 def test_no_request_migration_trtllm_worker_failure(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration disabled using TRT-LLM.
@@ -263,7 +263,7 @@ def test_no_request_migration_trtllm_worker_failure(
 
 @pytest.mark.skip(reason="TRT-LLM graceful shutdown not yet implemented")
 def test_no_request_migration_trtllm_graceful_shutdown(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration disabled using TRT-LLM.

@@ -115,7 +115,7 @@ def is_ready(self, response) -> bool:
 
 @pytest.mark.timeout(290)  # 3x average
 def test_request_migration_vllm_worker_failure(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration support.
@@ -159,7 +159,7 @@ def test_request_migration_vllm_worker_failure(
 
 @pytest.mark.timeout(280)  # 3x average
 def test_request_migration_vllm_graceful_shutdown(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration support.
@@ -207,7 +207,7 @@ def test_request_migration_vllm_graceful_shutdown(
 
 @pytest.mark.timeout(150)  # 3x average
 def test_no_request_migration_vllm_worker_failure(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration disabled.
@@ -267,7 +267,7 @@ def test_no_request_migration_vllm_worker_failure(
 
 @pytest.mark.timeout(140)  # 3x average
 def test_no_request_migration_vllm_graceful_shutdown(
-    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+    request, runtime_services, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration disabled.