tmp: Return to setting free gpu mem fraction

kthui · kthui · commit c57ef9388422 · 2025-12-15T16:27:53.000-08:00
Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
diff --git a/tests/fault_tolerance/cancellation/test_trtllm.py b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -72,6 +72,8 @@ def __init__(
             FAULT_TOLERANCE_MODEL_NAME,
             "--disaggregation-mode",
             mode,
+            "--free-gpu-memory-fraction",
+            "0.2",
             "--max-seq-len",
             "16384",
             "--max-num-tokens",
@@ -85,7 +87,7 @@ def __init__(
                     "cache_transceiver_config:\n  backend: DEFAULT\n  max_tokens_in_buffer: 16384\n"
                 )
                 f.write("disable_overlap_scheduler: true\n")
-                f.write("kv_cache_config:\n  max_tokens: 16384\n")
+                # f.write("kv_cache_config:\n  max_tokens: 16384\n")
             command += [
                 "--extra-engine-args",
                 "test_request_cancellation_trtllm_config.yaml",