Commit a473402

feat: fault tolerance rolling upgrade test scenarios (#4558)

1 parent 5585f80 commit a473402

8 files changed: +706 additions, -214 deletions

tests/fault_tolerance/deploy/client.py

Lines changed: 108 additions & 34 deletions

@@ -18,12 +18,14 @@
 import json
 import logging
 import os
+import signal
 import subprocess
 import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import requests
+from kr8s.objects import Pod
 
 from tests.utils.managed_deployment import ManagedDeployment
 
@@ -44,7 +46,7 @@ def get_frontend_port(
     deployment_spec: Any,
     pod_ports: Dict[str, Any],
     logger: logging.Logger,
-) -> Tuple[Optional[str], Optional[int], Optional[str]]:
+) -> Tuple[Optional[str], Optional[int], Optional[Pod]]:
     """
     Select a frontend pod using round-robin and setup port forwarding.
 
@@ -60,7 +62,7 @@ def get_frontend_port(
     Returns:
         Tuple of (pod_name, local_port, pod_instance) or (None, None, None) if failed
     """
-    pods = managed_deployment.get_pods(managed_deployment.frontend_service_name)
+    pods = managed_deployment.get_pods([managed_deployment.frontend_service_name])
 
     port = 0
     pod_name = None
@@ -270,6 +272,7 @@ def run_aiperf(
     logger: logging.Logger,
     max_retries: int = 1,
     retry_delay: float = 1,
+    continuous_load: bool = False,
 ) -> bool:
     """
     Execute AI-Perf with specified parameters.
@@ -280,13 +283,14 @@ def run_aiperf(
         model: Model name
         pod_name: Selected pod name for logging
        port: Local port number
-        requests_per_client: Number of requests to send
+        requests_per_client: Number of requests to send (used if continuous load not enabled)
         input_token_length: Input token count
         output_token_length: Output token count
         output_dir: Directory for AI-Perf artifacts
         logger: Logger instance
         max_retries: Maximum number of retry attempts (default: 1)
         retry_delay: Delay in seconds between retries (default: 1)
+        continuous_load: If True, use continuous load instead of fixed request count
 
     Returns:
         True if successful, False otherwise
@@ -315,8 +319,6 @@ def run_aiperf(
         # Enable streaming for TTFT and ITL metrics
         "--streaming",
         # Request parameters
-        "--request-count",
-        str(requests_per_client),  # Required: how many requests
         "--concurrency",
         "1",  # Optional: we set to 1 for sequential
         # Token configuration
@@ -338,8 +340,13 @@ def run_aiperf(
         "100",  # For reproducible results
     ]
 
-    # Calculate timeout (same as legacy would for all requests)
-    timeout = max(requests_per_client * 2 + 60, 300)  # At least 5 minutes
+    if continuous_load:
+        cmd.extend(["--benchmark-duration", "1800"])  # 30 minutes for continuous load
+        logger.info("Using continuous load with duration: 30 minutes")
+        timeout = 1860  # 31 minutes default for duration-based tests (30 minutes + 1 minute buffer)
+    else:
+        cmd.extend(["--request-count", str(requests_per_client)])
+        timeout = max(requests_per_client * 2 + 60, 300)  # At least 5 minutes
 
     # Log execution
     logger.info(f"Starting AI-Perf for Pod {pod_name} Local Port {port}")
@@ -354,15 +361,19 @@ def run_aiperf(
     logger.info(f"Command: {' '.join(cmd)}")
 
     # Retry logic for fault tolerance - retry FULL request count until success
-
-    max_attempts = max_retries if max_retries > 0 else 1
+    # Note: For continuous load, we only run once and expect SIGINT to stop it
+    max_attempts = 1 if continuous_load else (max_retries if max_retries > 0 else 1)
     success = False
-    all_results = []
 
     for attempt in range(max_attempts):
-        logger.info(
-            f"AI-Perf attempt {attempt + 1}/{max_attempts} with {requests_per_client} requests"
-        )
+        if continuous_load:
+            logger.info(
+                "AI-Perf continuous load (will run until interrupted by SIGINT)"
+            )
+        else:
+            logger.info(
+                f"AI-Perf attempt {attempt + 1}/{max_attempts} with {requests_per_client} requests"
+            )
 
         # Update output directory for this attempt
         attempt_dir = output_dir / f"attempt_{attempt}"
@@ -374,13 +385,7 @@
         cmd_attempt[artifact_dir_idx] = str(attempt_dir)
 
         try:
-            result = subprocess.run(
-                cmd_attempt,
-                capture_output=True,
-                text=True,
-                timeout=timeout,
-                stdin=subprocess.DEVNULL,  # Prevent stdin reading which can cause process suspension
-            )
+            result = run_aiperf_with_signal_handling(cmd_attempt, logger, timeout)
 
             # Save logs for this attempt
             with open(attempt_dir / "genai_perf.log", "w") as f:
@@ -389,15 +394,6 @@
                 f.write("\n\n=== STDERR ===\n")
                 f.write(result.stderr)
 
-            all_results.append(
-                {
-                    "attempt": attempt + 1,
-                    "returncode": result.returncode,
-                    "stdout": result.stdout,
-                    "stderr": result.stderr,
-                }
-            )
-
             if result.returncode == 0:
                 # AI-Perf returns 0 even if all requests failed, so we need to check the output
                 json_path = attempt_dir / "profile_export_aiperf.json"
@@ -412,6 +408,19 @@
                 )
                 if success:
                     break  # Success - exit the retry loop
+            ## TODO: bug with aiperf git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
+            ## where sending a SIGINT on Mac can sometimes have an error code of -9 (SIGABRT) which results in profile_export_aiperf.json not being created
+            elif result.returncode == -9 and continuous_load:
+                logger.warning(
+                    f"""
+                    Attempt {attempt + 1} failed with return code {result.returncode}
+                    This is a known bug with aiperf on Mac where sending a SIGINT can sometimes have an error code of -9 (SIGABRT)
+                    which results in profile_export_aiperf.json not being created
+                    """
+                )
+                logger.debug(
+                    f"Stderr: {result.stderr[:500] if result.stderr else 'No stderr'}"
+                )
             else:
                 logger.warning(
                     f"Attempt {attempt + 1} failed with return code {result.returncode}"
@@ -421,22 +430,84 @@
                 )
         except Exception as e:
             logger.error(f"Error in attempt {attempt + 1}: {str(e)}")
-            all_results.append({"attempt": attempt + 1, "error": str(e)})
 
-        # Sleep before next attempt (if not the last attempt)
-        if not success and attempt < max_attempts - 1:
+        # Sleep before next attempt (if not the last attempt and not continuous load)
+        if not success and attempt < max_attempts - 1 and not continuous_load:
             time.sleep(retry_delay)
 
-    if success:
+    if success and not continuous_load:
         logger.info(
             f"AI-Perf successfully completed all {requests_per_client} requests for {pod_name}"
         )
+    elif success and continuous_load:
+        logger.info(
+            f"AI-Perf sustained continuous load for {pod_name} and exited successfully"
+        )
     else:
         logger.error(f"AI-Perf failed all {max_attempts} attempts for {pod_name}")
 
     return success
 
 
+# TODO: use file redirection and wait() instead of pipes and communicate
+def run_aiperf_with_signal_handling(
+    cmd_attempt: List[str],
+    logger: logging.Logger,
+    timeout: int,
+) -> subprocess.CompletedProcess:
+    """
+    Run aiperf with signal handling for graceful shutdown.
+
+    Handles SIGINT and SIGTERM forwarding and timeout when running with subprocess.Popen.
+    This ensures that Ctrl-C (SIGINT) and graceful termination signals (SIGTERM)
+    are properly forwarded to the subprocess so it can clean up gracefully and write results files.
+    """
+    proc = subprocess.Popen(
+        cmd_attempt,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        stdin=subprocess.DEVNULL,
+    )
+
+    def signal_handler(signum, frame):
+        signal_names = {
+            signal.SIGINT: "SIGINT",
+            signal.SIGTERM: "SIGTERM",
+        }
+        signal_name = signal_names.get(signum, f"signal {signum}")
+        logger.info(f"Received {signal_name}, forwarding to aiperf subprocess")
+        try:
+            proc.send_signal(signum)
+        except ProcessLookupError:
+            pass  # Process already terminated
+
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        stdout, stderr = proc.communicate(timeout=timeout)
+        returncode = proc.returncode
+    except subprocess.TimeoutExpired:
+        logger.warning(f"AI-Perf subprocess timed out after {timeout}s")
+        proc.kill()
+        stdout, stderr = proc.communicate()
+        returncode = proc.returncode
+    except KeyboardInterrupt:
+        logger.info("Received KeyboardInterrupt, sending SIGINT to aiperf subprocess")
+        proc.send_signal(signal.SIGINT)
+        try:
+            stdout, stderr = proc.communicate(timeout=30)  # Give it time to clean up
+            returncode = proc.returncode
+        except subprocess.TimeoutExpired:
+            logger.warning("Subprocess didn't terminate gracefully, killing it")
+            proc.kill()
+            stdout, stderr = proc.communicate()
+            returncode = proc.returncode
+
+    return subprocess.CompletedProcess(cmd_attempt, returncode, stdout, stderr)
+
+
 def log_summary_metrics(
     output_dir: Path, logger: logging.Logger, pod_name: str, port: int
 ) -> None:
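
The helper above exists so that a SIGINT can reach aiperf and let it flush its result files before exiting. A minimal, self-contained sketch of that contract (not part of the commit; `sleep` stands in for the continuous-load aiperf command, assuming POSIX signal semantics):

    import signal
    import subprocess
    import time

    proc = subprocess.Popen(["sleep", "1800"])  # stand-in for the continuous-load aiperf run
    time.sleep(5)                               # a rolling upgrade would happen here
    proc.send_signal(signal.SIGINT)             # graceful stop so the child can clean up
    print(proc.wait(timeout=30))                # -2: sleep has no SIGINT handler, so the signal kills it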
@@ -513,6 +584,7 @@ def client(
     output_token_length: int,
     max_retries: int,
     retry_delay: float = 1,
+    continuous_load: bool = False,
 ):
     """
     Generate load using AI-Perf for fault tolerance testing.
@@ -527,11 +599,12 @@ def client(
         model: Model name
         log_dir: Directory for output logs and AI-Perf artifacts
         index: Client index used for round-robin pod selection
-        requests_per_client: Number of requests to generate
+        requests_per_client: Number of requests to generate (used if continuous load not enabled)
         input_token_length: Number of input tokens per request
         output_token_length: Number of output tokens per request
         max_retries: Maximum retry attempts for AI-Perf execution
         retry_delay: Delay in seconds between retry attempts
+        continuous_load: If True, use continuous load instead of fixed request count
     """
     logger = logging.getLogger(f"CLIENT: {index}")
     logging.getLogger("httpx").setLevel(logging.WARNING)
@@ -578,6 +651,7 @@ def client(
         logger=logger,
         max_retries=max_retries,
         retry_delay=retry_delay,
+        continuous_load=continuous_load,
     )
 
     if not success:
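
One caveat about the -9 return code special-cased above (an editor's observation, not from the commit): in `subprocess`, a negative return code -N means the child died from signal N, and signal 9 is SIGKILL, while SIGABRT is 6, so the upstream bug's "SIGABRT" label does not match the number. A two-line check:

    import signal
    print(int(signal.SIGKILL), int(signal.SIGABRT))  # 9 6 -> a returncode of -9 means killed by SIGKILL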

tests/fault_tolerance/deploy/client_factory.py

Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@ def get_client_function(client_type: str) -> Callable:
             output_token_length,
             max_retries,
             retry_delay_or_rate,  # Differs between implementations
+            continuous_load,
         )
 
     Raises:

tests/fault_tolerance/deploy/conftest.py

Lines changed: 13 additions & 0 deletions

@@ -35,6 +35,13 @@ def pytest_addoption(parser):
         help="Include tests that require custom builds (e.g., MoE models). "
         "By default, these tests are excluded.",
     )
+    parser.addoption(
+        "--skip-service-restart",
+        action="store_true",
+        default=False,
+        help="Skip restarting NATS and etcd services before deployment. "
+        "By default, these services are restarted.",
+    )
 
 
 def pytest_generate_tests(metafunc):
@@ -109,3 +116,9 @@ def namespace(request):
 def client_type(request):
     """Get client type from command line or use scenario default."""
     return request.config.getoption("--client-type")
+
+
+@pytest.fixture
+def skip_service_restart(request):
+    """Get skip restart services flag from command line."""
+    return request.config.getoption("--skip-service-restart")
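
A hedged sketch of how a test might consume the new fixture (the restart helper below is hypothetical, not part of the commit); on the command line the option is passed as `pytest ... --skip-service-restart`:

    def restart_backing_services():
        print("restarting NATS and etcd")  # hypothetical stand-in for the real restart logic

    def test_rolling_upgrade(skip_service_restart):
        # The fixture mirrors the --skip-service-restart CLI flag.
        if not skip_service_restart:
            restart_backing_services()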

tests/fault_tolerance/deploy/legacy_client.py

Lines changed: 6 additions & 1 deletion

@@ -192,6 +192,7 @@ def client(
     max_retries,
     max_request_rate,
     retry_delay=1,
+    continuous_load=False,
 ):
     """Legacy custom client for fault tolerance testing.
 
@@ -211,7 +212,11 @@
         max_retries: Maximum retry attempts per request
         max_request_rate: Maximum requests per second (for rate limiting)
         retry_delay: Delay in seconds between retries
+        continuous_load: If True, use continuous load instead of fixed request count
     """
+    if continuous_load:
+        raise ValueError("Continuous load is not supported for legacy client")
+
     logger = logging.getLogger(f"CLIENT: {index}")
     logging.getLogger("httpx").setLevel(logging.WARNING)
 
@@ -228,7 +233,7 @@
     for i in range(requests_per_client):
         # Get available pods
         pods = managed_deployment.get_pods(
-            managed_deployment.frontend_service_name
+            [managed_deployment.frontend_service_name]
         )
         port = 0
         pod_name = None
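
A self-contained sketch of the new guard's behavior (the stub mirrors the check above; it is not the real client):

    def legacy_client_stub(continuous_load=False):
        # Mirrors the up-front guard added to legacy_client.client
        if continuous_load:
            raise ValueError("Continuous load is not supported for legacy client")

    try:
        legacy_client_stub(continuous_load=True)
    except ValueError as err:
        print(err)  # Continuous load is not supported for legacy client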
