Commit 8c37f8e

fix: Set max_tokens_in_buffer on TRT-LLM cache transceiver
Signed-off-by: Jacky <[email protected]>
1 parent 9fb5f03 commit 8c37f8e

File tree

1 file changed: +3 -5 lines changed


tests/fault_tolerance/cancellation/test_trtllm.py

Lines changed: 3 additions & 5 deletions
@@ -55,7 +55,9 @@ def __init__(self, request, mode: str = "prefill_and_decode"):
         ]
         if mode != "prefill_and_decode":
             with open("test_request_cancellation_trtllm_config.yaml", "w") as f:
-                f.write("cache_transceiver_config:\n backend: DEFAULT\n")
+                f.write(
+                    "cache_transceiver_config:\n backend: DEFAULT\n max_tokens_in_buffer: 16384\n"
+                )
                 f.write("disable_overlap_scheduler: true\n")
             command += [
                 "--extra-engine-args",
@@ -373,10 +375,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-@pytest.mark.xfail(
-    reason="May fail due to unknown reason with TRT-LLM or backend implementation",
-    strict=False,
-)
 def test_request_cancellation_trtllm_kv_transfer_cancel(
     request, runtime_services, predownload_models
 ):
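For context, when mode is not "prefill_and_decode" the test writes an extra-engine-args file, test_request_cancellation_trtllm_config.yaml, whose contents after this change should look roughly like the sketch below (reconstructed from the two f.write calls in the diff, assuming both run; spacing follows the literal strings in the test):

cache_transceiver_config:
 backend: DEFAULT
 max_tokens_in_buffer: 16384
disable_overlap_scheduler: true

With max_tokens_in_buffer set on the cache transceiver, the xfail marker on test_request_cancellation_trtllm_kv_transfer_cancel is removed, so the KV-transfer cancellation test is now expected to pass.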
