Skip to content

Commit 32eaecb

Browse files
authored
fix: Set max_tokens_in_buffer on TRT-LLM cache transceiver (#4703)
Signed-off-by: Jacky <[email protected]>
1 parent 59e6873 commit 32eaecb

File tree

1 file changed

+14
-16
lines changed

1 file changed

+14
-16
lines changed

tests/fault_tolerance/cancellation/test_trtllm.py

Lines changed: 14 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,12 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
"""
5-
Test Execution Times (Last Run: 2025-12-09):
5+
Test Execution Times (Last Run: 2025-12-13):
66
- test_request_cancellation_trtllm_aggregated: ~45s (gpu_1)
7-
- test_request_cancellation_trtllm_decode_cancel: ~115s (gpu_1)
8-
- test_request_cancellation_trtllm_prefill_cancel: ~115s (gpu_1)
9-
- test_request_cancellation_trtllm_kv_transfer_cancel: ~115s (gpu_1, xfail)
10-
- Total: ~390s (0:06:30)
7+
- test_request_cancellation_trtllm_decode_cancel: ~65s (gpu_1)
8+
- test_request_cancellation_trtllm_prefill_cancel: ~65s (gpu_1)
9+
- test_request_cancellation_trtllm_kv_transfer_cancel: ~65s (gpu_1)
10+
- Total: ~240s x2 request planes = ~480s (0:08:00)
1111
"""
1212

1313
import logging
@@ -72,8 +72,6 @@ def __init__(
7272
FAULT_TOLERANCE_MODEL_NAME,
7373
"--disaggregation-mode",
7474
mode,
75-
"--free-gpu-memory-fraction",
76-
"0.45",
7775
"--max-seq-len",
7876
"16384",
7977
"--max-num-tokens",
@@ -83,8 +81,11 @@ def __init__(
8381
]
8482
if mode != "prefill_and_decode":
8583
with open("test_request_cancellation_trtllm_config.yaml", "w") as f:
86-
f.write("cache_transceiver_config:\n backend: DEFAULT\n")
84+
f.write(
85+
"cache_transceiver_config:\n backend: DEFAULT\n max_tokens_in_buffer: 16384\n"
86+
)
8787
f.write("disable_overlap_scheduler: true\n")
88+
f.write("kv_cache_config:\n max_tokens: 16384\n")
8889
command += [
8990
"--extra-engine-args",
9091
"test_request_cancellation_trtllm_config.yaml",
@@ -164,7 +165,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
164165
return super().__exit__(exc_type, exc_val, exc_tb)
165166

166167

167-
@pytest.mark.timeout(140) # 3x average
168+
@pytest.mark.timeout(135) # 3x average
168169
def test_request_cancellation_trtllm_aggregated(
169170
request, runtime_services_dynamic_ports, predownload_models
170171
):
@@ -251,7 +252,7 @@ def test_request_cancellation_trtllm_aggregated(
251252
logger.info(f"{description} detected successfully")
252253

253254

254-
@pytest.mark.timeout(350) # 3x average
255+
@pytest.mark.timeout(195) # 3x average
255256
def test_request_cancellation_trtllm_decode_cancel(
256257
request, runtime_services_dynamic_ports, predownload_models
257258
):
@@ -335,7 +336,7 @@ def test_request_cancellation_trtllm_decode_cancel(
335336
)
336337

337338

338-
@pytest.mark.timeout(350) # 3x average
339+
@pytest.mark.timeout(195) # 3x average
339340
def test_request_cancellation_trtllm_prefill_cancel(
340341
request, runtime_services_dynamic_ports, predownload_models
341342
):
@@ -427,11 +428,8 @@ def test_request_cancellation_trtllm_prefill_cancel(
427428
)
428429

429430

430-
@pytest.mark.timeout(350) # 3x average
431-
@pytest.mark.xfail(
432-
reason="May fail due to unknown reason with TRT-LLM or backend implementation",
433-
strict=False,
434-
)
431+
@pytest.mark.xfail(reason="Test fails only on CI", strict=False)
432+
@pytest.mark.timeout(195) # 3x average
435433
def test_request_cancellation_trtllm_kv_transfer_cancel(
436434
request, runtime_services_dynamic_ports, predownload_models
437435
):

0 commit comments

Comments
 (0)