Skip to content

Commit 909fb97

Browse files
committed
test: add dynamic port allocation for parallel test execution
Add port allocation utilities and update fault tolerance tests (vLLM, SGLang, TRT-LLM) to use dynamically allocated ports, enabling parallel test execution.

- Add tests/utils/port_utils.py for dynamic port allocation
- Update conftest.py: EtcdServer/NatsServer support dynamic ports
- Move get_pid() to ManagedProcess base class
- Update vLLM tests with dynamic port allocation and timing docs
- Update SGLang tests with dynamic port allocation and timing docs
- Update TRT-LLM tests with dynamic port allocation and timing docs

Signed-off-by: Keiven Chang <[email protected]>
1 parent 94d145a commit 909fb97

File tree

11 files changed

+1058
-212
lines changed

11 files changed

+1058
-212
lines changed

tests/conftest.py

Lines changed: 123 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,5 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
3-
#
4-
# Licensed under the Apache License, Version 2.0 (the "License");
5-
# you may not use this file except in compliance with the License.
6-
# You may obtain a copy of the License at
7-
#
8-
# http://www.apache.org/licenses/LICENSE-2.0
9-
#
10-
# Unless required by applicable law or agreed to in writing, software
11-
# distributed under the License is distributed on an "AS IS" BASIS,
12-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
# See the License for the specific language governing permissions and
14-
# limitations under the License.
153

164
import logging
175
import os
@@ -26,6 +14,14 @@
2614

2715
from tests.utils.constants import TEST_MODELS
2816
from tests.utils.managed_process import ManagedProcess
17+
from tests.utils.port_utils import (
18+
allocate_free_port,
19+
allocate_free_ports,
20+
free_port,
21+
free_ports,
22+
)
23+
24+
_logger = logging.getLogger(__name__)
2925

3026

3127
def pytest_configure(config):
@@ -249,43 +245,121 @@ def pytest_runtestloop(session):
249245

250246
class EtcdServer(ManagedProcess):
    """Managed ``etcd`` process used by the test fixtures.

    Args:
        request: pytest ``request`` object; ``request.node.name`` is used as
            the log directory.
        port: Client port to listen on. Pass ``0`` or ``None`` to have a
            client port and a peer port allocated via ``allocate_free_ports``.
        timeout: Forwarded unchanged to ``ManagedProcess``.

    Attributes:
        port: The client port actually in use.
        peer_port: The allocated peer port, or ``None`` when a fixed client
            port was requested.
    """

    def __init__(self, request, port=2379, timeout=300):
        # Allocate free ports if port is None or 0. (Previously only 0 was
        # honoured, so port=None produced the URL "http://0.0.0.0:None".)
        use_random_port = port is None or port == 0
        if use_random_port:
            # Need two ports: client port and peer port for parallel execution
            # Start from 2380 (etcd default 2379 + 1)
            port, peer_port = allocate_free_ports(2, 2380)
        else:
            peer_port = None

        self.port = port
        self.peer_port = peer_port  # Store for cleanup
        port_string = str(port)
        etcd_env = os.environ.copy()
        etcd_env["ALLOW_NONE_AUTHENTICATION"] = "yes"
        data_dir = tempfile.mkdtemp(prefix="etcd_")

        command = [
            "etcd",
            "--listen-client-urls",
            f"http://0.0.0.0:{port_string}",
            "--advertise-client-urls",
            f"http://0.0.0.0:{port_string}",
        ]

        # Add peer port configuration only for random ports (parallel execution)
        if peer_port is not None:
            peer_port_string = str(peer_port)
            command.extend(
                [
                    "--listen-peer-urls",
                    f"http://0.0.0.0:{peer_port_string}",
                    "--initial-advertise-peer-urls",
                    f"http://localhost:{peer_port_string}",
                    "--initial-cluster",
                    f"default=http://localhost:{peer_port_string}",
                ]
            )

        command.extend(
            [
                "--data-dir",
                data_dir,
            ]
        )
        super().__init__(
            env=etcd_env,
            command=command,
            timeout=timeout,
            display_output=False,
            terminate_existing=not use_random_port,  # Disabled for parallel test execution with random ports
            health_check_ports=[port],
            data_dir=data_dir,
            log_dir=request.node.name,
        )

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the recorded ports, then let the parent terminate the process.

        Returns whatever ``ManagedProcess.__exit__`` returns. Failures while
        releasing ports are logged and do not prevent the parent ``__exit__``
        from running.
        """
        try:
            # Release allocated ports BEFORE calling parent __exit__
            ports_to_release = [
                candidate
                for candidate in (
                    getattr(self, "port", None),
                    getattr(self, "peer_port", None),
                )
                if candidate is not None
            ]
            try:
                if ports_to_release:
                    free_ports(ports_to_release)
            except Exception as e:
                logging.warning(f"Failed to release EtcdServer port: {e}")
        finally:
            # Always call parent __exit__ to terminate the process. The result
            # is returned *after* the finally block: a `return` inside
            # `finally` would silently discard an in-flight BaseException
            # (e.g. KeyboardInterrupt raised while releasing ports).
            result = super().__exit__(exc_type, exc_val, exc_tb)
        return result
320+
275321

276322
class NatsServer(ManagedProcess):
    """Managed ``nats-server`` process (JetStream enabled) used by the test fixtures.

    Args:
        request: pytest ``request`` object; ``request.node.name`` is used as
            the log directory.
        port: Port passed to ``nats-server -p``. Pass ``0`` or ``None`` to
            have a port allocated via ``allocate_free_port``.
        timeout: Forwarded unchanged to ``ManagedProcess``.

    Attributes:
        port: The port actually in use.
    """

    def __init__(self, request, port=4222, timeout=300):
        # Allocate a free port if port is None or 0. (Previously only 0 was
        # honoured, so port=None resulted in "-p None" on the command line.)
        use_random_port = port is None or port == 0
        if use_random_port:
            # Start from 4223 (nats-server default 4222 + 1)
            port = allocate_free_port(4223)

        self.port = port
        data_dir = tempfile.mkdtemp(prefix="nats_")
        command = [
            "nats-server",
            "-js",
            "--trace",
            "--store_dir",
            data_dir,
            "-p",
            str(port),
        ]
        super().__init__(
            command=command,
            timeout=timeout,
            display_output=False,
            terminate_existing=not use_random_port,  # Disabled for parallel test execution with random ports
            data_dir=data_dir,
            health_check_ports=[port],
            log_dir=request.node.name,
        )

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release the recorded port, then let the parent terminate the process.

        Returns whatever ``ManagedProcess.__exit__`` returns. A failure while
        releasing the port is logged and does not prevent the parent
        ``__exit__`` from running.
        """
        try:
            try:
                # Release allocated port BEFORE calling parent __exit__
                recorded_port = getattr(self, "port", None)
                if recorded_port is not None:
                    free_port(recorded_port)
            except Exception as e:
                logging.warning(f"Failed to release NatsServer port: {e}")
        finally:
            # Always call parent __exit__ to terminate the process. The result
            # is returned *after* the finally block: a `return` inside
            # `finally` would silently discard an in-flight BaseException
            # (e.g. KeyboardInterrupt raised while releasing the port).
            result = super().__exit__(exc_type, exc_val, exc_tb)
        return result
362+
289363

290364
class SharedManagedProcess:
291365
"""Base class for ManagedProcess with file-based reference counting for multi-process sharing."""
@@ -414,11 +488,45 @@ def _create_server(self) -> ManagedProcess:
414488

415489
@pytest.fixture()
def runtime_services(request):
    """Start a NATS server and an Etcd server for the duration of one test.

    Both servers are constructed with their default arguments, i.e. the
    default ``port`` values declared on ``NatsServer`` (4222) and
    ``EtcdServer`` (2379).

    Yields a tuple of (nats_process, etcd_process); each has a .port attribute
    that tests can use to build NATS_SERVER and ETCD_ENDPOINTS values for
    their subprocess environments.
    """
    # Port cleanup is handled in the NatsServer and EtcdServer __exit__ methods.
    # A single `with` enters NATS first and exits it last, exactly like the
    # equivalent nested form.
    with NatsServer(request) as nats_process, EtcdServer(request) as etcd_process:
        yield nats_process, etcd_process
420501

421502

503+
@pytest.fixture()
def runtime_services_dynamic_ports(request):
    """Provide NATS and Etcd servers with truly dynamic ports.

    This fixture actually allocates dynamic ports by passing port=0 to the servers.
    It also sets the NATS_SERVER and ETCD_ENDPOINTS environment variables so that
    Dynamo processes can find the services on the dynamic ports. On teardown the
    variables are restored to whatever they were before the fixture ran (or
    removed if they were previously unset).

    Yields a tuple of (nats_process, etcd_process) where each has a .port attribute.
    """
    # Port cleanup is now handled in NatsServer and EtcdServer __exit__ methods
    with NatsServer(request, port=0) as nats_process:
        with EtcdServer(request, port=0) as etcd_process:
            # Set environment variables for the dynamic ports.
            # xdist (parallel execution) will launch isolated tests in a new process, so no need to worry about environment pollution.
            overrides = {
                "NATS_SERVER": f"nats://localhost:{nats_process.port}",
                "ETCD_ENDPOINTS": f"http://localhost:{etcd_process.port}",
            }
            # Remember prior values so teardown does not clobber settings that
            # existed before this fixture ran (None means "was unset").
            previous = {name: os.environ.get(name) for name in overrides}
            os.environ.update(overrides)

            try:
                yield nats_process, etcd_process
            finally:
                # Restore the environment while the servers are still running.
                for name, old_value in previous.items():
                    if old_value is None:
                        os.environ.pop(name, None)
                    else:
                        os.environ[name] = old_value
528+
529+
422530
@pytest.fixture(scope="session")
423531
def runtime_services_session(request, tmp_path_factory):
424532
"""Session-scoped fixture that provides shared NATS and etcd instances for all tests.

0 commit comments

Comments
 (0)