
Commit 67273ab

test: reenable router + vllm tests (#4746)
Signed-off-by: PeaBrane <[email protected]>
1 parent d580031 commit 67273ab
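
This commit drops the blanket skip on the vLLM end-to-end suite and re-tags the router tests. As a usage note (illustrative, not part of the patch), the affected suites can be selected locally by the markers this diff touches, e.g. `pytest tests/router/test_router_e2e_with_mockers.py -m pre_merge` for the mocker-based router tests (marked gpu_0) and `pytest tests/router/test_router_e2e_with_vllm.py -m gpu_1` for the single-GPU vLLM tests; the exact CI invocation is defined outside this commit.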

File tree: 2 files changed, +103 −50 lines changed


tests/router/test_router_e2e_with_mockers.py

Lines changed: 8 additions & 1 deletion

@@ -30,7 +30,6 @@
     pytest.mark.pre_merge,
     pytest.mark.gpu_0,
     pytest.mark.integration,
-    pytest.mark.parallel,
     pytest.mark.model(MODEL_NAME),
 ]
 NUM_MOCKERS = 2
@@ -287,6 +286,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self._process.__exit__(exc_type, exc_val, exc_tb)


+@pytest.mark.parallel
 def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers):
     """
     Test KV router with multiple mocker engine instances.
@@ -326,6 +326,7 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers):
         mockers.__exit__(None, None, None)


+@pytest.mark.parallel
 @pytest.mark.parametrize("store_backend", ["etcd", "file"])
 def test_mocker_two_kv_router(
     request,
@@ -381,6 +382,7 @@ def test_mocker_two_kv_router(
         mockers.__exit__(None, None, None)


+@pytest.mark.parallel
 @pytest.mark.skip(reason="Flaky, temporarily disabled")
 def test_mocker_kv_router_overload_503(
     request, runtime_services_session, predownload_tokenizers
@@ -419,6 +421,7 @@ def test_mocker_kv_router_overload_503(
         mockers.__exit__(None, None, None)


+@pytest.mark.parallel
 def test_kv_push_router_bindings(
     request, runtime_services_session, predownload_tokenizers
 ):
@@ -504,6 +507,7 @@ def test_indexers_sync(
         mockers.__exit__(None, None, None)


+@pytest.mark.parallel
 def test_query_instance_id_returns_worker_and_tokens(
     request, runtime_services_session, predownload_tokenizers
 ):
@@ -538,6 +542,7 @@ def test_query_instance_id_returns_worker_and_tokens(
         mockers.__exit__(None, None, None)


+@pytest.mark.parallel
 def test_router_decisions(request, runtime_services_session, predownload_tokenizers):
     """Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes."""

@@ -577,6 +582,7 @@ def test_router_decisions(request, runtime_services_session, predownload_tokenizers):
         mockers.__exit__(None, None, None)


+@pytest.mark.parallel
 def test_router_disagg_decisions(
     request, runtime_services_session, predownload_tokenizers
 ):
@@ -642,6 +648,7 @@ def test_router_disagg_decisions(
         prefill_workers.__exit__(None, None, None)


+@pytest.mark.parallel
 def test_busy_threshold_endpoint(
     request, runtime_services_session, predownload_tokenizers
 ):
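
As a side note on the marker change above: removing pytest.mark.parallel from the module-level pytestmark list and attaching it per test means each test now opts in explicitly, and future tests added to the module are no longer marked by default. A minimal illustrative sketch of the two forms (not part of the patch):

    import pytest

    # Module-level form: every test collected from this module inherits the markers.
    pytestmark = [pytest.mark.integration]

    # Per-test form, as used by the hunks above: the marker is applied explicitly.
    @pytest.mark.parallel
    def test_example():
        assert True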

tests/router/test_router_e2e_with_vllm.py

Lines changed: 95 additions & 49 deletions

@@ -10,6 +10,7 @@
 from tests.router.common import (  # utilities
     _test_router_basic,
     _test_router_decisions,
+    _test_router_indexers_sync,
     generate_random_suffix,
     get_runtime,
 )
@@ -20,7 +21,6 @@
 MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

 pytestmark = [
-    pytest.mark.pre_merge,
     pytest.mark.e2e,
     pytest.mark.vllm,
     pytest.mark.model(MODEL_NAME),
@@ -46,6 +46,16 @@
     "max_tokens": 10,
 }

+# Shared vLLM configuration for all tests
+# gpu_memory_utilization limits actual VRAM allocation (required for multi-worker on same GPU)
+VLLM_ARGS: Dict[str, Any] = {
+    "block_size": BLOCK_SIZE,
+    "model": MODEL_NAME,
+    "gpu_memory_utilization": 0.4,  # Limit VRAM allocation per worker
+    "max_model_len": 1024,  # Limit context length to reduce KV cache size
+    "enforce_eager": True,  # Disable CUDA graphs for faster startup & lower memory
+}
+

 class VLLMProcess:
     """Manages vLLM workers using dynamo.vllm (HTTP API + KV events).
@@ -72,11 +82,12 @@ def __init__(
             vllm_args: Configuration dict with keys:
                 - block_size: KV cache block size (default: 16)
                 - model: Model name/path (default: TinyLlama-1.1B)
-                - gpu_memory_utilization: GPU memory fraction per worker (default: 0.9)
+                - gpu_memory_utilization: Fraction of GPU memory to allocate (optional)
+                - num_gpu_blocks_override: Cap on number of KV cache blocks (optional)
                 - max_model_len: Maximum sequence length (optional)
-                - speedup_ratio: IGNORED (vLLM runs at real speed)
+                - enforce_eager: Disable CUDA graphs (default: False)
             num_workers: Number of vLLM worker processes
-            single_gpu: If True, all workers share GPU 0 (requires gpu_memory_utilization < 1.0/num_workers)
+            single_gpu: If True, all workers share GPU 0
             data_parallel_size: If set, enables data parallelism with this many ranks (num_workers must equal data_parallel_size)
         """
         # Generate unique namespace for isolation
@@ -92,8 +103,10 @@ def __init__(

         block_size = vllm_args.get("block_size", BLOCK_SIZE)
         model = vllm_args.get("model", MODEL_NAME)
-        gpu_memory_utilization = vllm_args.get("gpu_memory_utilization", 0.9)
+        gpu_memory_utilization = vllm_args.get("gpu_memory_utilization")
+        num_gpu_blocks_override = vllm_args.get("num_gpu_blocks_override")
         max_model_len = vllm_args.get("max_model_len")
+        enforce_eager = vllm_args.get("enforce_eager", False)

         self.model_name = model

@@ -130,15 +143,28 @@ def __init__(
                 model,
                 "--block-size",
                 str(block_size),
-                "--enforce-eager",  # Disable CUDA graphs for faster startup
-                "--gpu-memory-utilization",
-                str(gpu_memory_utilization),
             ]

+            # Disable CUDA graphs for faster startup & lower memory
+            if enforce_eager:
+                command.append("--enforce-eager")
+
+            # Limit VRAM allocation (required for multi-worker on same GPU)
+            if gpu_memory_utilization is not None:
+                command.extend(
+                    ["--gpu-memory-utilization", str(gpu_memory_utilization)]
+                )
+
             # Add optional max_model_len if specified
             if max_model_len is not None:
                 command.extend(["--max-model-len", str(max_model_len)])

+            # Cap block count for predictable KV cache behavior
+            if num_gpu_blocks_override is not None:
+                command.extend(
+                    ["--num-gpu-blocks-override", str(num_gpu_blocks_override)]
+                )
+
             if data_parallel_size is not None:
                 # Add DP configuration for external load balancing
                 # See: https://docs.vllm.ai/en/v0.10.0/serving/data_parallel_deployment.html#external-load-balancing
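
For reference, with the shared VLLM_ARGS defined earlier, the conditional logic in the hunk above would append roughly the following flags to the worker command (a sketch only, not asserted anywhere in the patch; num_gpu_blocks_override is unset, so that flag is skipped):

    # Flags the new conditionals would add for VLLM_ARGS, in order
    expected_extra_flags = [
        "--enforce-eager",
        "--gpu-memory-utilization", "0.4",
        "--max-model-len", "1024",
    ]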
@@ -157,6 +183,8 @@ def __init__(
                 {
                     "CUDA_VISIBLE_DEVICES": gpu_device,
                     "DYN_NAMESPACE": self.namespace,
+                    "DYN_VLLM_KV_EVENT_PORT": str(20080 + worker_idx),
+                    "VLLM_NIXL_SIDE_CHANNEL_PORT": str(20090 + worker_idx),
                     "PYTHONHASHSEED": "0",  # for deterministic event id's
                 }
             )
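
The two added environment variables give each worker process its own KV-event and NIXL side-channel port, presumably so workers co-located on one host do not collide. For the two-worker tests below, the assignment works out as in this small sketch:

    # worker_idx -> (DYN_VLLM_KV_EVENT_PORT, VLLM_NIXL_SIDE_CHANNEL_PORT)
    for worker_idx in range(2):
        print(worker_idx, 20080 + worker_idx, 20090 + worker_idx)
    # 0 -> 20080, 20090
    # 1 -> 20081, 20091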
@@ -176,13 +204,13 @@ def __init__(
             if data_parallel_size is not None:
                 logger.info(
                     f"Created {data_parallel_size} DP ranks per worker on GPU(s) {gpu_device} "
-                    f"(gpu_memory_utilization={gpu_memory_utilization}) "
+                    f"(gpu_mem={gpu_memory_utilization}) "
                     f"with endpoint: {self.endpoint}"
                 )
             else:
                 logger.info(
                     f"Created vLLM worker {worker_idx} on GPU {gpu_device} "
-                    f"(gpu_memory_utilization={gpu_memory_utilization}) "
+                    f"(gpu_mem={gpu_memory_utilization}) "
                     f"with endpoint: {self.endpoint}"
                 )

@@ -276,9 +304,11 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         time.sleep(2)


+@pytest.mark.pre_merge
 @pytest.mark.gpu_1
-@pytest.mark.skip(reason="All vLLM tests disabled for now")
-def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
+def test_vllm_kv_router_basic(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """
     Quick e2e sanity test for KV router with vLLM engine instances.
     """
@@ -287,19 +317,12 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
     N_VLLM_WORKERS = 2
     logger.info(f"Starting vLLM KV router test with {N_VLLM_WORKERS} workers")

-    vllm_args = {
-        "block_size": BLOCK_SIZE,
-        "model": MODEL_NAME,
-        "gpu_memory_utilization": 0.35,
-        "max_model_len": 1024,  # Limit context length to reduce KV cache size
-    }
-
     try:
         # Start vLLM workers
         logger.info(f"Starting {N_VLLM_WORKERS} vLLM workers")
         vllm_workers = VLLMProcess(
             request,
-            vllm_args=vllm_args,
+            vllm_args=VLLM_ARGS,
             num_workers=N_VLLM_WORKERS,
             single_gpu=True,  # fit workers into one GPU
         )
@@ -323,32 +346,22 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
         vllm_workers.__exit__(None, None, None)


+@pytest.mark.pre_merge
 @pytest.mark.gpu_1
-@pytest.mark.skip(reason="All vLLM tests disabled for now")
 def test_router_decisions_vllm_multiple_workers(
-    request, runtime_services, predownload_tokenizers
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
     # runtime_services starts etcd and nats
     logger.info("Starting vLLM router prefix reuse test with two workers")
-
-    # Create vLLM args - one worker with dp_size=2, sharing GPU 0
-    vllm_args = {
-        "block_size": BLOCK_SIZE,
-        "model": MODEL_NAME,
-        "gpu_memory_utilization": 0.35,
-        "max_model_len": 1024,  # Limit context length to reduce KV cache size
-    }
     N_WORKERS = 2

     try:
-        # Start 2 worker processes (dp_rank 0 and dp_rank 1) on the same GPU
-        logger.info(
-            "Starting 2 vLLM worker processes with dp_size=2 on single GPU (gpu_memory_utilization=0.35, max_model_len=1024)"
-        )
+        # Start 2 worker processes on the same GPU
+        logger.info("Starting 2 vLLM worker processes on single GPU (gpu_mem=0.4)")
         vllm_workers = VLLMProcess(
             request,
-            vllm_args=vllm_args,
-            num_workers=N_WORKERS,  # One worker process with dp_size=2
+            vllm_args=VLLM_ARGS,
+            num_workers=N_WORKERS,
             single_gpu=True,  # Worker uses GPU 0
         )
         logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
@@ -373,32 +386,24 @@ def test_router_decisions_vllm_multiple_workers(


 @pytest.mark.gpu_2
-@pytest.mark.skip(reason="All vLLM tests disabled for now")
-def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers):
+def test_router_decisions_vllm_dp(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
     """Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
     Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).
     Dump events from router and verify:
     * All but one (worker_id, dp_rank) should have no events (due to prefix reuse)
     * The (worker_id, dp_rank) with events should have exactly 4 events (one per request)
     * All events should be on the forced (worker_id, dp_rank=1) (verifying forced routing and prefix reuse)
     """
-    # Create vLLM args - one worker with dp_size=2, sharing GPU 0
-    vllm_args = {
-        "block_size": BLOCK_SIZE,
-        "model": MODEL_NAME,
-        "gpu_memory_utilization": 0.35,
-        "max_model_len": 1024,  # Limit context length to reduce KV cache size
-    }
     N_WORKERS = 1
     DP_SIZE = 2

     try:
-        logger.info(
-            "Starting 2 vLLM DP ranks (dp_size=2) on single GPU (gpu_memory_utilization=0.35, max_model_len=1024)"
-        )
+        logger.info("Starting 2 vLLM DP ranks (dp_size=2) (gpu_mem=0.4)")
         vllm_workers = VLLMProcess(
             request,
-            vllm_args=vllm_args,
+            vllm_args=VLLM_ARGS,
             num_workers=N_WORKERS,  # Ignored when data_parallel_size is set
             single_gpu=False,
             data_parallel_size=DP_SIZE,  # Creates DP_SIZE processes (one per rank)
@@ -421,3 +426,44 @@ def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers):
         # Clean up vLLM workers
         if "vllm_workers" in locals():
             vllm_workers.__exit__(None, None, None)
+
+
+@pytest.mark.pre_merge
+@pytest.mark.gpu_1
+def test_vllm_indexers_sync(
+    request, runtime_services, predownload_models, set_ucx_tls_no_mm
+):
+    """
+    Test that two KV routers have synchronized indexer states after processing requests
+    with vLLM workers. This test verifies that both routers converge to the same internal state.
+    """
+    logger.info("Starting vLLM indexers sync test")
+    N_VLLM_WORKERS = 2
+
+    try:
+        # Start vLLM workers
+        logger.info(f"Starting {N_VLLM_WORKERS} vLLM workers")
+        vllm_workers = VLLMProcess(
+            request,
+            vllm_args=VLLM_ARGS,
+            num_workers=N_VLLM_WORKERS,
+            single_gpu=True,  # fit workers into one GPU
+        )
+        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
+        vllm_workers.__enter__()
+
+        # Use the common test implementation (creates its own runtimes for each router)
+        # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
+        _test_router_indexers_sync(
+            engine_workers=vllm_workers,
+            block_size=BLOCK_SIZE,
+            model_name=MODEL_NAME,
+            num_workers=N_VLLM_WORKERS,
+            store_backend="etcd",
+        )
+
+        logger.info("vLLM indexers sync test completed successfully")
+
+    finally:
+        if "vllm_workers" in locals():
+            vllm_workers.__exit__(None, None, None)
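
As a usage note (illustrative, not part of the patch): once a GPU and the etcd/nats services brought up by the runtime_services fixture are available, the new test can be targeted directly with `pytest tests/router/test_router_e2e_with_vllm.py::test_vllm_indexers_sync`; its mocker-based counterpart, test_indexers_sync in test_router_e2e_with_mockers.py, presumably exercises the same shared _test_router_indexers_sync helper without needing a GPU.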
