Skip to content

Commit c6b440e

Browse files
authored
test: add dynamic port allocation for fault_tolerant test execution (#4835)
Signed-off-by: Keiven Chang <[email protected]> Co-authored-by: Keiven Chang <[email protected]>
1 parent 111e08c commit c6b440e

File tree

13 files changed

+1104
-234
lines changed

13 files changed

+1104
-234
lines changed

tests/conftest.py

Lines changed: 132 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,5 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
3-
#
4-
# Licensed under the Apache License, Version 2.0 (the "License");
5-
# you may not use this file except in compliance with the License.
6-
# You may obtain a copy of the License at
7-
#
8-
# http://www.apache.org/licenses/LICENSE-2.0
9-
#
10-
# Unless required by applicable law or agreed to in writing, software
11-
# distributed under the License is distributed on an "AS IS" BASIS,
12-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
# See the License for the specific language governing permissions and
14-
# limitations under the License.
153

164
import logging
175
import os
@@ -26,6 +14,14 @@
2614

2715
from tests.utils.constants import TEST_MODELS
2816
from tests.utils.managed_process import ManagedProcess
17+
from tests.utils.port_utils import (
18+
allocate_port,
19+
allocate_ports,
20+
deallocate_port,
21+
deallocate_ports,
22+
)
23+
24+
_logger = logging.getLogger(__name__)
2925

3026

3127
def pytest_configure(config):
@@ -249,43 +245,118 @@ def pytest_runtestloop(session):
249245

250246
class EtcdServer(ManagedProcess):
    """Managed etcd process for tests.

    With the default ``port`` the server listens on the given client port and
    lets etcd pick its default peer settings. With ``port=0`` a client port and
    a peer port are obtained from ``allocate_ports`` so several instances can
    run side by side; those ports are handed back via ``deallocate_ports`` once
    the process has been shut down.

    Attributes set by ``__init__``:
        port: Client port the server listens on.
        peer_port: Peer port, or ``None`` when the ports were not allocated here.
        use_random_port: ``True`` when the ports were allocated by this instance.
    """

    def __init__(self, request, port=2379, timeout=300):
        """Build the etcd command line and pass it to ``ManagedProcess``.

        Args:
            request: pytest request object; ``request.node.name`` is used as
                the log directory.
            port: Client port. ``0`` means "allocate a client and a peer port".
            timeout: Forwarded unchanged to ``ManagedProcess``.
        """
        use_random_port = port == 0
        if use_random_port:
            # Two ports are needed for parallel execution: client and peer.
            # The search starts at 2380 (etcd default 2379 + 1).
            port, peer_port = allocate_ports(2, 2380)
        else:
            peer_port = None

        self.port = port
        self.peer_port = peer_port  # Stored for cleanup
        self.use_random_port = use_random_port  # Track if we allocated the ports
        self._ports_released = False  # Makes _release_ports() idempotent

        try:
            port_string = str(port)
            etcd_env = os.environ.copy()
            etcd_env["ALLOW_NONE_AUTHENTICATION"] = "yes"
            data_dir = tempfile.mkdtemp(prefix="etcd_")

            command = [
                "etcd",
                "--listen-client-urls",
                f"http://0.0.0.0:{port_string}",
                "--advertise-client-urls",
                f"http://0.0.0.0:{port_string}",
            ]

            # Peer configuration is only added for allocated ports (parallel
            # execution); otherwise etcd keeps its own peer defaults.
            if peer_port is not None:
                peer_port_string = str(peer_port)
                command.extend(
                    [
                        "--listen-peer-urls",
                        f"http://0.0.0.0:{peer_port_string}",
                        "--initial-advertise-peer-urls",
                        f"http://localhost:{peer_port_string}",
                        "--initial-cluster",
                        f"default=http://localhost:{peer_port_string}",
                    ]
                )

            command.extend(
                [
                    "--data-dir",
                    data_dir,
                ]
            )

            super().__init__(
                env=etcd_env,
                command=command,
                timeout=timeout,
                display_output=False,
                # Disabled for parallel test execution with random ports.
                terminate_existing=not use_random_port,
                health_check_ports=[port],
                data_dir=data_dir,
                log_dir=request.node.name,
            )
        except BaseException:
            # Construction failed after the ports were reserved: nobody will
            # ever call __exit__ on this object, so give the ports back now.
            self._release_ports()
            raise

    def _release_ports(self):
        """Hand dynamically allocated ports back to the allocator, at most once."""
        if not self.use_random_port or self._ports_released:
            return
        self._ports_released = True

        ports_to_release = [self.port]
        if self.peer_port is not None:
            ports_to_release.append(self.peer_port)
        try:
            deallocate_ports(ports_to_release)
        except Exception as e:
            # Best effort: a failed release must not mask the test outcome.
            _logger.warning("Failed to release EtcdServer port: %s", e)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Shut the server down, then release any dynamically allocated ports.

        The ports are released only after ``ManagedProcess.__exit__`` has run,
        so they are not offered to another test while this etcd instance may
        still be bound to them. The release happens in ``finally`` so it also
        runs when the parent teardown raises.
        """
        try:
            return super().__exit__(exc_type, exc_val, exc_tb)
        finally:
            self._release_ports()
317+
275318

276319
class NatsServer(ManagedProcess):
    """Managed ``nats-server`` process (JetStream enabled) for tests.

    With ``port=0`` a port is obtained from ``allocate_port`` so several
    instances can run side by side; that port is handed back via
    ``deallocate_port`` once the process has been shut down.

    Attributes set by ``__init__``:
        port: Port passed to ``nats-server`` with ``-p``.
        use_random_port: ``True`` when the port was allocated by this instance.
    """

    def __init__(self, request, port=4222, timeout=300):
        """Build the nats-server command line and pass it to ``ManagedProcess``.

        Args:
            request: pytest request object; ``request.node.name`` is used as
                the log directory.
            port: Listen port. ``0`` means "allocate a free port".
            timeout: Forwarded unchanged to ``ManagedProcess``.
        """
        use_random_port = port == 0
        if use_random_port:
            # The search starts at 4223 (nats-server default 4222 + 1).
            port = allocate_port(4223)

        self.port = port
        self.use_random_port = use_random_port  # Track if we allocated the port
        self._port_released = False  # Makes _release_port() idempotent

        try:
            data_dir = tempfile.mkdtemp(prefix="nats_")
            command = [
                "nats-server",
                "-js",
                "--trace",
                "--store_dir",
                data_dir,
                "-p",
                str(port),
            ]
            super().__init__(
                command=command,
                timeout=timeout,
                display_output=False,
                # Disabled for parallel test execution with random ports.
                terminate_existing=not use_random_port,
                data_dir=data_dir,
                health_check_ports=[port],
                log_dir=request.node.name,
            )
        except BaseException:
            # Construction failed after the port was reserved: nobody will
            # ever call __exit__ on this object, so give the port back now.
            self._release_port()
            raise

    def _release_port(self):
        """Hand a dynamically allocated port back to the allocator, at most once."""
        if not self.use_random_port or self._port_released:
            return
        self._port_released = True
        try:
            deallocate_port(self.port)
        except Exception as e:
            # Best effort: a failed release must not mask the test outcome.
            _logger.warning("Failed to release NatsServer port: %s", e)

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Shut the server down, then release a dynamically allocated port.

        The port is released only after ``ManagedProcess.__exit__`` has run,
        so it is not offered to another test while this nats-server may still
        be bound to it. The release happens in ``finally`` so it also runs
        when the parent teardown raises.
        """
        try:
            return super().__exit__(exc_type, exc_val, exc_tb)
        finally:
            self._release_port()
359+
289360

290361
class SharedManagedProcess:
291362
"""Base class for ManagedProcess with file-based reference counting for multi-process sharing."""
@@ -445,7 +516,10 @@ def runtime_services(request, store_kv, request_plane):
445516
446517
- If store_kv != "etcd", etcd is not started (returns None)
447518
- If request_plane != "nats", NATS is not started (returns None)
519+
520+
Returns a tuple of (nats_process, etcd_process) where each has a .port attribute.
448521
"""
522+
# Port cleanup is now handled in NatsServer and EtcdServer __exit__ methods
449523
if request_plane == "nats" and store_kv == "etcd":
450524
with NatsServer(request) as nats_process:
451525
with EtcdServer(request) as etcd_process:
@@ -460,6 +534,49 @@ def runtime_services(request, store_kv, request_plane):
460534
yield None, None
461535

462536

537+
@pytest.fixture()
def runtime_services_dynamic_ports(request, store_kv, request_plane):
    """Provide NATS and Etcd servers with truly dynamic ports per test.

    The servers are created with ``port=0`` so each picks up an allocated port.
    While the test runs, ``NATS_SERVER`` and/or ``ETCD_ENDPOINTS`` point at the
    started servers so that Dynamo processes can find them. On teardown each
    variable is restored to the value it had before the fixture ran (or removed
    if it was unset), and this happens before the servers are stopped.

    - If store_kv != "etcd", etcd is not started (returns None)
    - If request_plane != "nats", NATS is not started (returns None)

    Returns a tuple of (nats_process, etcd_process) where each has a .port attribute.
    """
    import contextlib

    # Port cleanup is handled in the NatsServer and EtcdServer __exit__ methods.
    with contextlib.ExitStack() as servers:
        nats_process = None
        etcd_process = None
        env_overrides = {}

        # NATS is entered first and etcd second, so ExitStack tears them down
        # in the reverse order (etcd, then NATS), like nested ``with`` blocks.
        if request_plane == "nats":
            nats_process = servers.enter_context(NatsServer(request, port=0))
            env_overrides["NATS_SERVER"] = f"nats://localhost:{nats_process.port}"
        if store_kv == "etcd":
            etcd_process = servers.enter_context(EtcdServer(request, port=0))
            env_overrides["ETCD_ENDPOINTS"] = f"http://localhost:{etcd_process.port}"

        # Remember what was there before so teardown restores it instead of
        # blindly deleting a value set by the caller's environment.
        previous_values = {name: os.environ.get(name) for name in env_overrides}

        # Set environment variables for the Rust/Python runtime to use.
        os.environ.update(env_overrides)
        try:
            yield nats_process, etcd_process
        finally:
            # No test should rely on these variables afterwards; put the
            # environment back exactly as it was found.
            for name, previous in previous_values.items():
                if previous is None:
                    os.environ.pop(name, None)
                else:
                    os.environ[name] = previous
578+
579+
463580
@pytest.fixture(scope="session")
464581
def runtime_services_session(request, tmp_path_factory):
465582
"""Session-scoped fixture that provides shared NATS and etcd instances for all tests.

0 commit comments

Comments
 (0)