Use runtime_services_dynamic_ports for vLLM tests

keivenchang · keivenchang · commit 69f3f7d00c4c · 2025-12-10T05:50:24.000Z
Update vLLM cancellation and migration tests to use
runtime_services_dynamic_ports fixture for fully dynamic NATS/Etcd
ports, enabling true parallel test execution.

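- Restore max_tokens to 16384 in send_cancellable_request
- Add TODO to etcd_ha/test_sglang.py for future port update
- Loosen the determine_request_receiving_worker return annotation to a bare
  tuple and drop the unreachable raise statements after pytest.fail() in
  migration/utils.py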

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -148,7 +148,7 @@ def is_ready(self, response) -> bool:
 
 
 @pytest.mark.timeout(110)  # 3x average
-def test_request_cancellation_vllm_aggregated(request, runtime_services):
+def test_request_cancellation_vllm_aggregated(request, runtime_services_dynamic_ports):
     """
     End-to-end test for request cancellation functionality in aggregated mode.
 
@@ -232,7 +232,7 @@ def test_request_cancellation_vllm_aggregated(request, runtime_services):
 
 @pytest.mark.timeout(150)  # 3x average
 def test_request_cancellation_vllm_decode_cancel(
-    request, runtime_services, set_ucx_tls_no_mm
+    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during decode phase.
@@ -324,7 +324,7 @@ def test_request_cancellation_vllm_decode_cancel(
 
 @pytest.mark.timeout(150)  # 3x average
 def test_request_cancellation_vllm_prefill_cancel(
-    request, runtime_services, set_ucx_tls_no_mm
+    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for request cancellation during prefill phase.
diff --git a/tests/fault_tolerance/cancellation/utils.py b/tests/fault_tolerance/cancellation/utils.py
@@ -268,11 +268,11 @@ def send_cancellable_request(
         prompt += " Make sure it is" + " long" * 16000 + "!"
 
     if request_type == "completion":
-        return send_completion_request(prompt, 8192, frontend_port)
+        return send_completion_request(prompt, 16384, frontend_port)
     elif request_type == "chat_completion":
-        return send_chat_completion_request(prompt, 8192, frontend_port, stream=False)
+        return send_chat_completion_request(prompt, 16384, frontend_port, stream=False)
     elif request_type == "chat_completion_stream":
-        return send_chat_completion_request(prompt, 8192, frontend_port, stream=True)
+        return send_chat_completion_request(prompt, 16384, frontend_port, stream=True)
     else:
         raise ValueError(f"Unknown request type: {request_type}")
 
diff --git a/tests/fault_tolerance/etcd_ha/test_sglang.py b/tests/fault_tolerance/etcd_ha/test_sglang.py
@@ -1,6 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+# TODO: Update to use dynamic port allocation (allocate_free_port) for parallel execution
+# Currently uses hardcoded ports: FRONTEND_PORT (8000), system ports (8081, 8082)
+# See tests/fault_tolerance/migration/test_sglang.py for dynamic port pattern
+
 import logging
 import os
 import shutil
diff --git a/tests/fault_tolerance/migration/test_vllm.py b/tests/fault_tolerance/migration/test_vllm.py
@@ -128,7 +128,7 @@ def is_ready(self, response) -> bool:
 
 @pytest.mark.timeout(290)  # 3x average
 def test_request_migration_vllm_worker_failure(
-    request, runtime_services, set_ucx_tls_no_mm
+    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration support.
@@ -186,7 +186,7 @@ def test_request_migration_vllm_worker_failure(
 
 @pytest.mark.timeout(280)  # 3x average
 def test_request_migration_vllm_graceful_shutdown(
-    request, runtime_services, set_ucx_tls_no_mm
+    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration support.
@@ -248,7 +248,7 @@ def test_request_migration_vllm_graceful_shutdown(
 
 @pytest.mark.timeout(150)  # 3x average
 def test_no_request_migration_vllm_worker_failure(
-    request, runtime_services, set_ucx_tls_no_mm
+    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with migration disabled.
@@ -326,7 +326,7 @@ def test_no_request_migration_vllm_worker_failure(
 
 @pytest.mark.timeout(140)  # 3x average
 def test_no_request_migration_vllm_graceful_shutdown(
-    request, runtime_services, set_ucx_tls_no_mm
+    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm
 ):
     """
     End-to-end test for worker fault tolerance with graceful shutdown and migration disabled.
diff --git a/tests/fault_tolerance/migration/utils.py b/tests/fault_tolerance/migration/utils.py
@@ -109,7 +109,7 @@ def send_request():
 
 def determine_request_receiving_worker(
     worker1: ManagedProcess, worker2: ManagedProcess, receiving_pattern: str
-) -> tuple[ManagedProcess, str]:
+) -> tuple:
     """
     Determine which worker received the request using parallel polling.
 
@@ -170,10 +170,8 @@ def poll_worker(worker: ManagedProcess, result_list: list[bool]):
         return worker2, "Worker 2"
     elif worker1_received and worker2_received:
         pytest.fail("Both workers received the request")
-        raise AssertionError("Unreachable")  # For mypy: pytest.fail() raises
     else:
         pytest.fail("Neither worker received the request")
-        raise AssertionError("Unreachable")  # For mypy: pytest.fail() raises
 
 
 def validate_completion_response(