Skip to content

Commit 0fa9b99

Browse files
authored
test: Include a timeout marker to all Cancellation / Migration E2E tests (#4764)
Signed-off-by: Jacky <[email protected]>
1 parent 501ef02 commit 0fa9b99

File tree

6 files changed: 49 additions and 26 deletions

tests/fault_tolerance/cancellation/test_sglang.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,13 @@
2121

2222
logger = logging.getLogger(__name__)
2323

24+
pytestmark = [
25+
pytest.mark.sglang,
26+
pytest.mark.e2e,
27+
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
28+
pytest.mark.nightly,
29+
]
30+
2431

2532
class DynamoWorkerProcess(ManagedProcess):
2633
"""Process manager for Dynamo worker with SGLang backend"""
@@ -146,11 +153,8 @@ def is_ready(self, response) -> bool:
146153
return False
147154

148155

149-
@pytest.mark.e2e
150-
@pytest.mark.sglang
156+
@pytest.mark.timeout(160) # 3x average
151157
@pytest.mark.gpu_1
152-
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
153-
@pytest.mark.nightly
154158
@pytest.mark.xfail(strict=False)
155159
def test_request_cancellation_sglang_aggregated(
156160
request, runtime_services, predownload_models
@@ -236,11 +240,8 @@ def test_request_cancellation_sglang_aggregated(
236240
logger.info(f"{description} detected successfully")
237241

238242

239-
@pytest.mark.e2e
240-
@pytest.mark.sglang
243+
@pytest.mark.timeout(185) # 3x average
241244
@pytest.mark.gpu_2
242-
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
243-
@pytest.mark.nightly
244245
def test_request_cancellation_sglang_decode_cancel(
245246
request, runtime_services, predownload_models
246247
):

tests/fault_tolerance/cancellation/test_trtllm.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
pytest.mark.gpu_1,
2727
pytest.mark.e2e,
2828
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
29+
pytest.mark.nightly,
2930
]
3031

3132

@@ -134,7 +135,7 @@ def is_ready(self, response) -> bool:
134135
return False
135136

136137

137-
@pytest.mark.nightly
138+
@pytest.mark.timeout(140) # 3x average
138139
def test_request_cancellation_trtllm_aggregated(
139140
request, runtime_services, predownload_models
140141
):
@@ -208,7 +209,7 @@ def test_request_cancellation_trtllm_aggregated(
208209
logger.info(f"{description} detected successfully")
209210

210211

211-
@pytest.mark.nightly
212+
@pytest.mark.timeout(350) # 3x average
212213
def test_request_cancellation_trtllm_decode_cancel(
213214
request, runtime_services, predownload_models
214215
):
@@ -281,7 +282,7 @@ def test_request_cancellation_trtllm_decode_cancel(
281282
)
282283

283284

284-
@pytest.mark.nightly
285+
@pytest.mark.timeout(350) # 3x average
285286
def test_request_cancellation_trtllm_prefill_cancel(
286287
request, runtime_services, predownload_models
287288
):
@@ -364,6 +365,7 @@ def test_request_cancellation_trtllm_prefill_cancel(
364365
)
365366

366367

368+
@pytest.mark.timeout(350) # 3x average
367369
@pytest.mark.xfail(
368370
reason="May fail due to unknown reason with TRT-LLM or backend implementation",
369371
strict=False,

tests/fault_tolerance/cancellation/test_vllm.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@
2020

2121
logger = logging.getLogger(__name__)
2222

23+
pytestmark = [
24+
pytest.mark.vllm,
25+
pytest.mark.gpu_1,
26+
pytest.mark.e2e,
27+
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
28+
pytest.mark.nightly,
29+
]
30+
2331

2432
class DynamoWorkerProcess(ManagedProcess):
2533
"""Process manager for Dynamo worker with vLLM backend"""
@@ -120,11 +128,7 @@ def is_ready(self, response) -> bool:
120128
return False
121129

122130

123-
@pytest.mark.vllm
124-
@pytest.mark.gpu_1
125-
@pytest.mark.e2e
126-
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
127-
@pytest.mark.nightly
131+
@pytest.mark.timeout(110) # 3x average
128132
def test_request_cancellation_vllm_aggregated(
129133
request, runtime_services, predownload_models
130134
):
@@ -198,11 +202,7 @@ def test_request_cancellation_vllm_aggregated(
198202
logger.info(f"{description} detected successfully")
199203

200204

201-
@pytest.mark.vllm
202-
@pytest.mark.gpu_1
203-
@pytest.mark.e2e
204-
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
205-
@pytest.mark.nightly
205+
@pytest.mark.timeout(150) # 3x average
206206
def test_request_cancellation_vllm_decode_cancel(
207207
request, runtime_services, predownload_models, set_ucx_tls_no_mm
208208
):
@@ -272,11 +272,7 @@ def test_request_cancellation_vllm_decode_cancel(
272272
)
273273

274274

275-
@pytest.mark.vllm
276-
@pytest.mark.gpu_1
277-
@pytest.mark.e2e
278-
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
279-
@pytest.mark.nightly
275+
@pytest.mark.timeout(150) # 3x average
280276
def test_request_cancellation_vllm_prefill_cancel(
281277
request, runtime_services, predownload_models, set_ucx_tls_no_mm
282278
):

tests/fault_tolerance/migration/test_sglang.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,11 @@ def is_ready(self, response) -> bool:
108108
return False
109109

110110

111+
@pytest.mark.timeout(235) # 3x average
112+
@pytest.mark.xfail(
113+
reason="For some reason both replicas received the request where only one should",
114+
strict=False,
115+
)
111116
def test_request_migration_sglang_worker_failure(
112117
request, runtime_services, predownload_models, set_ucx_tls_no_mm
113118
):
@@ -199,6 +204,11 @@ def test_request_migration_sglang_graceful_shutdown(
199204
verify_migration_occurred(frontend)
200205

201206

207+
@pytest.mark.timeout(135) # 3x average
208+
@pytest.mark.xfail(
209+
reason="For some reason both replicas received the request where only one should",
210+
strict=False,
211+
)
202212
def test_no_request_migration_sglang_worker_failure(
203213
request, runtime_services, predownload_models, set_ucx_tls_no_mm
204214
):

tests/fault_tolerance/migration/test_trtllm.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ def is_ready(self, response) -> bool:
104104
return False
105105

106106

107+
@pytest.mark.timeout(290) # 3x average
108+
@pytest.mark.xfail(
109+
reason="For some reason both replicas received the request where only one should",
110+
strict=False,
111+
)
107112
def test_request_migration_trtllm_worker_failure(
108113
request, runtime_services, predownload_models, set_ucx_tls_no_mm
109114
):
@@ -195,6 +200,11 @@ def test_request_migration_trtllm_graceful_shutdown(
195200
verify_migration_occurred(frontend)
196201

197202

203+
@pytest.mark.timeout(185) # 3x average
204+
@pytest.mark.xfail(
205+
reason="For some reason both replicas received the request where only one should",
206+
strict=False,
207+
)
198208
def test_no_request_migration_trtllm_worker_failure(
199209
request, runtime_services, predownload_models, set_ucx_tls_no_mm
200210
):

tests/fault_tolerance/migration/test_vllm.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def is_ready(self, response) -> bool:
108108
return False
109109

110110

111+
@pytest.mark.timeout(290) # 3x average
111112
def test_request_migration_vllm_worker_failure(
112113
request, runtime_services, predownload_models, set_ucx_tls_no_mm
113114
):
@@ -151,6 +152,7 @@ def test_request_migration_vllm_worker_failure(
151152
verify_migration_occurred(frontend)
152153

153154

155+
@pytest.mark.timeout(280) # 3x average
154156
def test_request_migration_vllm_graceful_shutdown(
155157
request, runtime_services, predownload_models, set_ucx_tls_no_mm
156158
):
@@ -198,6 +200,7 @@ def test_request_migration_vllm_graceful_shutdown(
198200
verify_migration_occurred(frontend)
199201

200202

203+
@pytest.mark.timeout(150) # 3x average
201204
def test_no_request_migration_vllm_worker_failure(
202205
request, runtime_services, predownload_models, set_ucx_tls_no_mm
203206
):
@@ -257,6 +260,7 @@ def test_no_request_migration_vllm_worker_failure(
257260
), f"Unexpected migration message: {e}"
258261

259262

263+
@pytest.mark.timeout(140) # 3x average
260264
def test_no_request_migration_vllm_graceful_shutdown(
261265
request, runtime_services, predownload_models, set_ucx_tls_no_mm
262266
):

0 commit comments

Comments
 (0)