Commit 8c37f8e

fix: Set max_tokens_in_buffer on TRT-LLM cache transceiver
Signed-off-by: Jacky <[email protected]>
1 parent 9fb5f03 commit 8c37f8e

File tree

1 file changed: +3 -5 lines changed


tests/fault_tolerance/cancellation/test_trtllm.py

Lines changed: 3 additions & 5 deletions
@@ -55,7 +55,9 @@ def __init__(self, request, mode: str = "prefill_and_decode"):
         ]
         if mode != "prefill_and_decode":
             with open("test_request_cancellation_trtllm_config.yaml", "w") as f:
-                f.write("cache_transceiver_config:\n backend: DEFAULT\n")
+                f.write(
+                    "cache_transceiver_config:\n backend: DEFAULT\n max_tokens_in_buffer: 16384\n"
+                )
                 f.write("disable_overlap_scheduler: true\n")
             command += [
                 "--extra-engine-args",
@@ -373,10 +375,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-@pytest.mark.xfail(
-    reason="May fail due to unknown reason with TRT-LLM or backend implementation",
-    strict=False,
-)
 def test_request_cancellation_trtllm_kv_transfer_cancel(
     request, runtime_services, predownload_models
 ):
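For context, when mode is not "prefill_and_decode" the test writes an extra-engine-args file, test_request_cancellation_trtllm_config.yaml, whose contents after this change should look roughly like the sketch below (reconstructed from the two f.write calls in the diff, assuming both run; spacing follows the literal strings in the test):

cache_transceiver_config:
 backend: DEFAULT
 max_tokens_in_buffer: 16384
disable_overlap_scheduler: true

With max_tokens_in_buffer set on the cache transceiver, the xfail marker on test_request_cancellation_trtllm_kv_transfer_cancel is removed, so the KV-transfer cancellation test is now expected to pass.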
