@@ -10,6 +10,7 @@
 from tests.router.common import (  # utilities
     _test_router_basic,
     _test_router_decisions,
+    _test_router_indexers_sync,
     generate_random_suffix,
     get_runtime,
 )
@@ -20,7 +21,6 @@
 MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
 pytestmark = [
-    pytest.mark.pre_merge,
     pytest.mark.e2e,
     pytest.mark.vllm,
     pytest.mark.model(MODEL_NAME),
4646 "max_tokens" : 10 ,
4747}
4848
49+ # Shared vLLM configuration for all tests
50+ # gpu_memory_utilization limits actual VRAM allocation (required for multi-worker on same GPU)
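+# (0.4 per worker lets two workers share a single GPU in the single-GPU tests below)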
+VLLM_ARGS: Dict[str, Any] = {
+    "block_size": BLOCK_SIZE,
+    "model": MODEL_NAME,
+    "gpu_memory_utilization": 0.4,  # Limit VRAM allocation per worker
+    "max_model_len": 1024,  # Limit context length to reduce KV cache size
+    "enforce_eager": True,  # Disable CUDA graphs for faster startup & lower memory
+}
+
 
 class VLLMProcess:
     """Manages vLLM workers using dynamo.vllm (HTTP API + KV events).
@@ -72,11 +82,12 @@ def __init__(
             vllm_args: Configuration dict with keys:
                 - block_size: KV cache block size (default: 16)
                 - model: Model name/path (default: TinyLlama-1.1B)
-                - gpu_memory_utilization: GPU memory fraction per worker (default: 0.9)
+                - gpu_memory_utilization: Fraction of GPU memory to allocate (optional)
+                - num_gpu_blocks_override: Cap on number of KV cache blocks (optional)
                 - max_model_len: Maximum sequence length (optional)
-                - speedup_ratio: IGNORED (vLLM runs at real speed)
+                - enforce_eager: Disable CUDA graphs (default: False)
             num_workers: Number of vLLM worker processes
-            single_gpu: If True, all workers share GPU 0 (requires gpu_memory_utilization < 1.0/num_workers)
+            single_gpu: If True, all workers share GPU 0
             data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size)
         """
         # Generate unique namespace for isolation
@@ -92,8 +103,10 @@ def __init__(
 
         block_size = vllm_args.get("block_size", BLOCK_SIZE)
         model = vllm_args.get("model", MODEL_NAME)
-        gpu_memory_utilization = vllm_args.get("gpu_memory_utilization", 0.9)
+        gpu_memory_utilization = vllm_args.get("gpu_memory_utilization")
+        num_gpu_blocks_override = vllm_args.get("num_gpu_blocks_override")
         max_model_len = vllm_args.get("max_model_len")
+        enforce_eager = vllm_args.get("enforce_eager", False)
 
         self.model_name = model
 
@@ -130,15 +143,28 @@ def __init__(
                 model,
                 "--block-size",
                 str(block_size),
-                "--enforce-eager",  # Disable CUDA graphs for faster startup
-                "--gpu-memory-utilization",
-                str(gpu_memory_utilization),
             ]
 
+            # Disable CUDA graphs for faster startup & lower memory
+            if enforce_eager:
+                command.append("--enforce-eager")
+
+            # Limit VRAM allocation (required for multi-worker on same GPU)
+            if gpu_memory_utilization is not None:
+                command.extend(
+                    ["--gpu-memory-utilization", str(gpu_memory_utilization)]
+                )
+
             # Add optional max_model_len if specified
             if max_model_len is not None:
                 command.extend(["--max-model-len", str(max_model_len)])
 
+            # Cap block count for predictable KV cache behavior
+            if num_gpu_blocks_override is not None:
+                command.extend(
+                    ["--num-gpu-blocks-override", str(num_gpu_blocks_override)]
+                )
+
             if data_parallel_size is not None:
                 # Add DP configuration for external load balancing
                 # See: https://docs.vllm.ai/en/v0.10.0/serving/data_parallel_deployment.html#external-load-balancing
@@ -157,6 +183,8 @@ def __init__(
                 {
                     "CUDA_VISIBLE_DEVICES": gpu_device,
                     "DYN_NAMESPACE": self.namespace,
186+ "DYN_VLLM_KV_EVENT_PORT" : str (20080 + worker_idx ),
187+ "VLLM_NIXL_SIDE_CHANNEL_PORT" : str (20090 + worker_idx ),
160188 "PYTHONHASHSEED" : "0" , # for deterministic event id's
161189 }
162190 )
@@ -176,13 +204,13 @@ def __init__(
             if data_parallel_size is not None:
                 logger.info(
                     f"Created {data_parallel_size} DP ranks per worker on GPU(s) {gpu_device} "
-                    f"(gpu_memory_utilization={gpu_memory_utilization}) "
+                    f"(gpu_mem={gpu_memory_utilization}) "
                     f"with endpoint: {self.endpoint}"
                 )
             else:
                 logger.info(
                     f"Created vLLM worker {worker_idx} on GPU {gpu_device} "
-                    f"(gpu_memory_utilization={gpu_memory_utilization}) "
+                    f"(gpu_mem={gpu_memory_utilization}) "
                     f"with endpoint: {self.endpoint}"
                 )
 
@@ -276,9 +304,11 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         time.sleep(2)
 
 
+@pytest.mark.pre_merge
 @pytest.mark.gpu_1
-@pytest.mark.skip(reason="All vLLM tests disabled for now")
-def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
+def test_vllm_kv_router_basic(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """
     Quick e2e sanity test for KV router with vLLM engine instances.
     """
@@ -287,19 +317,12 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
     N_VLLM_WORKERS = 2
     logger.info(f"Starting vLLM KV router test with {N_VLLM_WORKERS} workers")
 
-    vllm_args = {
-        "block_size": BLOCK_SIZE,
-        "model": MODEL_NAME,
-        "gpu_memory_utilization": 0.35,
-        "max_model_len": 1024,  # Limit context length to reduce KV cache size
-    }
-
     try:
         # Start vLLM workers
         logger.info(f"Starting {N_VLLM_WORKERS} vLLM workers")
         vllm_workers = VLLMProcess(
             request,
-            vllm_args=vllm_args,
+            vllm_args=VLLM_ARGS,
             num_workers=N_VLLM_WORKERS,
             single_gpu=True,  # fit workers into one GPU
         )
@@ -323,32 +346,22 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
             vllm_workers.__exit__(None, None, None)
 
 
+@pytest.mark.pre_merge
 @pytest.mark.gpu_1
-@pytest.mark.skip(reason="All vLLM tests disabled for now")
 def test_router_decisions_vllm_multiple_workers(
-    request, runtime_services, predownload_tokenizers
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     # runtime_services starts etcd and nats
     logger.info("Starting vLLM router prefix reuse test with two workers")
-
-    # Create vLLM args - one worker with dp_size=2, sharing GPU 0
-    vllm_args = {
-        "block_size": BLOCK_SIZE,
-        "model": MODEL_NAME,
-        "gpu_memory_utilization": 0.35,
-        "max_model_len": 1024,  # Limit context length to reduce KV cache size
-    }
     N_WORKERS = 2
 
     try:
-        # Start 2 worker processes (dp_rank 0 and dp_rank 1) on the same GPU
-        logger.info(
-            "Starting 2 vLLM worker processes with dp_size=2 on single GPU (gpu_memory_utilization=0.35, max_model_len=1024)"
-        )
+        # Start 2 worker processes on the same GPU
+        logger.info("Starting 2 vLLM worker processes on single GPU (gpu_mem=0.4)")
         vllm_workers = VLLMProcess(
             request,
-            vllm_args=vllm_args,
-            num_workers=N_WORKERS,  # One worker process with dp_size=2
+            vllm_args=VLLM_ARGS,
+            num_workers=N_WORKERS,
             single_gpu=True,  # Worker uses GPU 0
         )
         logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
@@ -373,32 +386,24 @@ def test_router_decisions_vllm_multiple_workers(
 
 
 @pytest.mark.gpu_2
-@pytest.mark.skip(reason="All vLLM tests disabled for now")
-def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers):
+def test_router_decisions_vllm_dp(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
     Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).
     Dump events from router and verify:
     * All but one (worker_id, dp_rank) should have no events (due to prefix reuse)
     * The (worker_id, dp_rank) with events should have exactly 4 events (one per request)
     * All events should be on the forced (worker_id, dp_rank=1) (verifying forced routing and prefix reuse)
     """
-    # Create vLLM args - one worker with dp_size=2, sharing GPU 0
-    vllm_args = {
-        "block_size": BLOCK_SIZE,
-        "model": MODEL_NAME,
-        "gpu_memory_utilization": 0.35,
-        "max_model_len": 1024,  # Limit context length to reduce KV cache size
-    }
     N_WORKERS = 1
     DP_SIZE = 2
 
     try:
-        logger.info(
-            "Starting 2 vLLM DP ranks (dp_size=2) on single GPU (gpu_memory_utilization=0.35, max_model_len=1024)"
-        )
+        logger.info("Starting 2 vLLM DP ranks (dp_size=2) (gpu_mem=0.4)")
         vllm_workers = VLLMProcess(
             request,
-            vllm_args=vllm_args,
+            vllm_args=VLLM_ARGS,
             num_workers=N_WORKERS,  # Ignored when data_parallel_size is set
             single_gpu=False,
             data_parallel_size=DP_SIZE,  # Creates DP_SIZE processes (one per rank)
@@ -421,3 +426,44 @@ def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokeniz
         # Clean up vLLM workers
         if "vllm_workers" in locals():
             vllm_workers.__exit__(None, None, None)
+
+
+@pytest.mark.pre_merge
+@pytest.mark.gpu_1
+def test_vllm_indexers_sync(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
+    """
+    Test that two KV routers have synchronized indexer states after processing requests
+    with vLLM workers. This test verifies that both routers converge to the same internal state.
+    """
+    logger.info("Starting vLLM indexers sync test")
+    N_VLLM_WORKERS = 2
+
+    try:
+        # Start vLLM workers
+        logger.info(f"Starting {N_VLLM_WORKERS} vLLM workers")
+        vllm_workers = VLLMProcess(
+            request,
+            vllm_args=VLLM_ARGS,
+            num_workers=N_VLLM_WORKERS,
+            single_gpu=True,  # fit workers into one GPU
+        )
+        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
+        vllm_workers.__enter__()
+
+        # Use the common test implementation (creates its own runtimes for each router)
+        # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
+        _test_router_indexers_sync(
+            engine_workers=vllm_workers,
+            block_size=BLOCK_SIZE,
+            model_name=MODEL_NAME,
+            num_workers=N_VLLM_WORKERS,
+            store_backend="etcd",
+        )
+
+        logger.info("vLLM indexers sync test completed successfully")
+
+    finally:
+        if "vllm_workers" in locals():
+            vllm_workers.__exit__(None, None, None)