Fix port allocation race conditions in cancellation tests

keivenchang · keivenchang · commit 3d3500645765 · 2025-11-05T10:56:01.000-08:00
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
diff --git a/tests/fault_tolerance/cancellation/test_sglang.py b/tests/fault_tolerance/cancellation/test_sglang.py
@@ -17,7 +17,7 @@
 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.payloads import check_health_generate, check_models_api
-from tests.utils.port_utils import get_free_port
+from tests.utils.port_utils import get_free_ports
 
 logger = logging.getLogger(__name__)
 
@@ -168,15 +168,18 @@ def test_request_cancellation_sglang_aggregated(
     See: https://github.com/sgl-project/sglang/issues/11139
     """
     logger.info("Sanity check if latest test is getting executed")
+
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, system_port = get_free_ports(2)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start an aggregated worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=system_port,
             frontend_port=frontend_port,
             mode="agg",
         ) as worker:
@@ -261,15 +264,17 @@ def test_request_cancellation_sglang_decode_cancel(
     Note: This test requires 2 GPUs to run decode and prefill workers on separate GPUs.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, decode_system_port, prefill_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the decode worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=decode_system_port,
             frontend_port=frontend_port,
             mode="decode",
         ) as decode_worker:
@@ -278,7 +283,7 @@ def test_request_cancellation_sglang_decode_cancel(
             # Step 3: Start the prefill worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=prefill_system_port,
                 frontend_port=frontend_port,
                 mode="prefill",
             ) as prefill_worker:
diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -17,7 +17,7 @@
 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.payloads import check_health_generate, check_models_api
-from tests.utils.port_utils import get_free_port
+from tests.utils.port_utils import get_free_ports
 
 logger = logging.getLogger(__name__)
 
@@ -151,15 +151,17 @@ def test_request_cancellation_trtllm_aggregated(
     on the worker side in aggregated (prefill_and_decode) mode.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, system_port = get_free_ports(2)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start an aggregated worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=system_port,
             frontend_port=frontend_port,
             mode="prefill_and_decode",
         ) as worker:
@@ -234,15 +236,17 @@ def test_request_cancellation_trtllm_disagg_decode_cancel(
     on the decode worker side in a disaggregated setup.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, prefill_system_port, decode_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the prefill worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=prefill_system_port,
             frontend_port=frontend_port,
             mode="prefill",
             strategy="decode_first",
@@ -252,7 +256,7 @@ def test_request_cancellation_trtllm_disagg_decode_cancel(
             # Step 3: Start the decode worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=decode_system_port,
                 frontend_port=frontend_port,
                 mode="decode",
                 strategy="decode_first",
@@ -325,15 +329,17 @@ def test_request_cancellation_trtllm_disagg_prefill_cancel(
     Since the request is cancelled before prefill completes, the decode worker never receives it.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, prefill_system_port, decode_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the prefill worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=prefill_system_port,
             frontend_port=frontend_port,
             mode="prefill",
             strategy="decode_first",
@@ -343,7 +349,7 @@ def test_request_cancellation_trtllm_disagg_prefill_cancel(
             # Step 3: Start the decode worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=decode_system_port,
                 frontend_port=frontend_port,
                 mode="decode",
                 strategy="decode_first",
@@ -420,15 +426,17 @@ def test_request_cancellation_trtllm_prefill_first_prefill_cancel(
     on the prefill worker side in a disaggregated setup using prefill_first strategy.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, decode_system_port, prefill_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the decode worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=decode_system_port,
             frontend_port=frontend_port,
             mode="decode",
             strategy="prefill_first",
@@ -438,7 +446,7 @@ def test_request_cancellation_trtllm_prefill_first_prefill_cancel(
             # Step 3: Start the prefill worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=prefill_system_port,
                 frontend_port=frontend_port,
                 mode="prefill",
                 strategy="prefill_first",
@@ -502,15 +510,17 @@ def test_request_cancellation_trtllm_prefill_first_remote_decode_cancel(
     on both the prefill and decode workers in a disaggregated setup using prefill_first strategy.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, decode_system_port, prefill_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the decode worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=decode_system_port,
             frontend_port=frontend_port,
             mode="decode",
             strategy="prefill_first",
@@ -520,7 +530,7 @@ def test_request_cancellation_trtllm_prefill_first_remote_decode_cancel(
             # Step 3: Start the prefill worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=prefill_system_port,
                 frontend_port=frontend_port,
                 mode="prefill",
                 strategy="prefill_first",
diff --git a/tests/fault_tolerance/cancellation/test_vllm.py b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -16,7 +16,7 @@
 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.payloads import check_health_generate, check_models_api
-from tests.utils.port_utils import get_free_port
+from tests.utils.port_utils import get_free_ports
 
 logger = logging.getLogger(__name__)
 
@@ -139,14 +139,16 @@ def test_request_cancellation_vllm_aggregated(
     3. Chat completion request (streaming)
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, system_port = get_free_ports(2)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start a single worker
         with DynamoWorkerProcess(
-            request, system_port=get_free_port(), frontend_port=frontend_port
+            request, system_port=system_port, frontend_port=frontend_port
         ) as worker:
             logger.info(f"Worker PID: {worker.get_pid()}")
 
@@ -216,15 +218,17 @@ def test_request_cancellation_vllm_decode_cancel(
     on the decode worker side in a disaggregated setup.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, prefill_system_port, decode_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the prefill worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=prefill_system_port,
             frontend_port=frontend_port,
             is_prefill=True,
         ) as prefill_worker:
@@ -233,7 +237,7 @@ def test_request_cancellation_vllm_decode_cancel(
             # Step 3: Start the decode worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=decode_system_port,
                 frontend_port=frontend_port,
                 is_prefill=False,
             ) as decode_worker:
@@ -302,15 +306,17 @@ def test_request_cancellation_vllm_remote_prefill_cancel(
     on both the decode and prefill workers in a disaggregated setup.
     """
 
+    # Allocate all ports upfront to avoid race conditions
+    frontend_port, prefill_system_port, decode_system_port = get_free_ports(3)
+
     # Step 1: Start the frontend
-    frontend_port = get_free_port()
     with DynamoFrontendProcess(request, frontend_port) as frontend:
         logger.info("Frontend started successfully")
 
         # Step 2: Start the prefill worker
         with DynamoWorkerProcess(
             request,
-            system_port=get_free_port(),
+            system_port=prefill_system_port,
             frontend_port=frontend_port,
             is_prefill=True,
         ) as prefill_worker:
@@ -319,7 +325,7 @@ def test_request_cancellation_vllm_remote_prefill_cancel(
             # Step 3: Start the decode worker
             with DynamoWorkerProcess(
                 request,
-                system_port=get_free_port(),
+                system_port=decode_system_port,
                 frontend_port=frontend_port,
                 is_prefill=False,
             ) as decode_worker: