Skip to content

Commit d5f425a

Browse files
authored
chore(pipeline): Move migration outside of backend (#4823)
Signed-off-by: Graham King <[email protected]>
1 parent 7c15166 commit d5f425a

File tree

2 files changed

+29
-33
lines changed

2 files changed

+29
-33
lines changed

lib/llm/src/entrypoint/input/common.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -271,13 +271,13 @@ where
271271
// Link with prefill chooser including backward edge for response flow
272272
let engine = frontend
273273
.link(preprocessor_op.forward_edge())?
274-
.link(backend.forward_edge())?
275274
.link(migration.forward_edge())?
275+
.link(backend.forward_edge())?
276276
.link(prefill_op.forward_edge())?
277277
.link(service_backend)?
278278
.link(prefill_op.backward_edge())?
279-
.link(migration.backward_edge())?
280279
.link(backend.backward_edge())?
280+
.link(migration.backward_edge())?
281281
.link(preprocessor_op.backward_edge())?
282282
.link(frontend)?;
283283

lib/llm/src/migration.rs

Lines changed: 27 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,8 @@ use async_nats::client::{
1111
};
1212

1313
use crate::{
14-
model_card::ModelDeploymentCard,
15-
protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
14+
model_card::ModelDeploymentCard, preprocessor::BackendOutput,
15+
protocols::common::llm_backend::PreprocessedRequest,
1616
};
1717

1818
use dynamo_runtime::{
@@ -44,16 +44,16 @@ impl Migration {
4444
impl
4545
Operator<
4646
SingleIn<PreprocessedRequest>,
47-
ManyOut<Annotated<LLMEngineOutput>>,
47+
ManyOut<Annotated<BackendOutput>>,
4848
SingleIn<PreprocessedRequest>,
49-
ManyOut<Annotated<LLMEngineOutput>>,
49+
ManyOut<Annotated<BackendOutput>>,
5050
> for Migration
5151
{
5252
async fn generate(
5353
&self,
5454
request: SingleIn<PreprocessedRequest>,
55-
next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
56-
) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
55+
next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
56+
) -> Result<ManyOut<Annotated<BackendOutput>>> {
5757
let (preprocessed_request, context) = request.transfer(());
5858
let engine_ctx = context.context();
5959
let engine_ctx_ = engine_ctx.clone();
@@ -73,16 +73,16 @@ impl
7373
struct RetryManager {
7474
context: Arc<dyn AsyncEngineContext>,
7575
request: PreprocessedRequest,
76-
next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
77-
next_stream: Option<ManyOut<Annotated<LLMEngineOutput>>>,
76+
next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
77+
next_stream: Option<ManyOut<Annotated<BackendOutput>>>,
7878
retries_left: u32,
7979
}
8080

8181
impl RetryManager {
8282
pub async fn build(
8383
context: Arc<dyn AsyncEngineContext>,
8484
preprocessed_request: PreprocessedRequest,
85-
next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
85+
next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
8686
retries_left: u32,
8787
) -> Result<Self> {
8888
let mut slf = Self {
@@ -96,7 +96,7 @@ impl RetryManager {
9696
Ok(slf)
9797
}
9898

99-
pub async fn next(&mut self) -> Option<Annotated<LLMEngineOutput>> {
99+
pub async fn next(&mut self) -> Option<Annotated<BackendOutput>> {
100100
loop {
101101
let response_stream = match self.next_stream.as_mut() {
102102
Some(stream) => stream,
@@ -128,7 +128,7 @@ impl RetryManager {
128128
}
129129

130130
async fn new_stream(&mut self) -> Result<()> {
131-
let mut response_stream: Option<Result<ManyOut<Annotated<LLMEngineOutput>>>> = None;
131+
let mut response_stream: Option<Result<ManyOut<Annotated<BackendOutput>>>> = None;
132132
while self.retries_left > 0 {
133133
self.retries_left -= 1;
134134
let request = Context::with_id(self.request.clone(), self.context.id().to_string());
@@ -162,7 +162,7 @@ impl RetryManager {
162162
}
163163
}
164164

165-
fn track_response(&mut self, response: &Annotated<LLMEngineOutput>) {
165+
fn track_response(&mut self, response: &Annotated<BackendOutput>) {
166166
if self.retries_left == 0 {
167167
return;
168168
}
@@ -207,18 +207,17 @@ mod tests {
207207
}
208208

209209
// Helper to create mock LLM engine output
210-
fn create_mock_output(token_id: u32) -> Annotated<LLMEngineOutput> {
211-
Annotated::from_data(LLMEngineOutput {
210+
fn create_mock_output(token_id: u32) -> Annotated<BackendOutput> {
211+
Annotated::from_data(BackendOutput {
212212
token_ids: vec![token_id],
213-
tokens: None,
214-
text: Some(format!("token_{}", token_id)),
213+
tokens: vec![],
214+
text: Some(format!("token_{token_id}")),
215215
cum_log_probs: None,
216216
log_probs: None,
217217
top_logprobs: None,
218218
finish_reason: None,
219219
index: None,
220220
disaggregated_params: None,
221-
extra_args: None,
222221
completion_usage: None,
223222
})
224223
}
@@ -267,16 +266,13 @@ mod tests {
267266

268267
#[async_trait]
269268
impl
270-
AsyncEngine<
271-
SingleIn<PreprocessedRequest>,
272-
ManyOut<Annotated<LLMEngineOutput>>,
273-
anyhow::Error,
274-
> for MockEngine
269+
AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<BackendOutput>>, anyhow::Error>
270+
for MockEngine
275271
{
276272
async fn generate(
277273
&self,
278274
request: SingleIn<PreprocessedRequest>,
279-
) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
275+
) -> Result<ManyOut<Annotated<BackendOutput>>> {
280276
let call_num = self.call_count.fetch_add(1, Ordering::SeqCst);
281277
let (preprocessed_request, context) = request.transfer(());
282278

@@ -457,7 +453,7 @@ mod tests {
457453
&self,
458454
start: usize,
459455
end: usize,
460-
) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
456+
) -> Result<ManyOut<Annotated<BackendOutput>>> {
461457
let (tx, rx) = mpsc::channel(1);
462458
let token_offset = self.token_offset;
463459

@@ -494,7 +490,7 @@ mod tests {
494490
100,
495491
context_id.clone(),
496492
));
497-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
493+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
498494
mock_engine;
499495

500496
let ctx = Arc::new(Controller::new(context_id.clone()));
@@ -533,7 +529,7 @@ mod tests {
533529
100,
534530
context_id.clone(),
535531
));
536-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
532+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
537533
mock_engine;
538534

539535
let ctx = Arc::new(Controller::new(context_id.clone()));
@@ -573,7 +569,7 @@ mod tests {
573569
100,
574570
context_id.clone(),
575571
));
576-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
572+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
577573
mock_engine;
578574

579575
let ctx = Arc::new(Controller::new(context_id.clone()));
@@ -613,7 +609,7 @@ mod tests {
613609
100,
614610
context_id.clone(),
615611
));
616-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
612+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
617613
mock_engine;
618614

619615
// Should fail to build due to initial stream creation failure after exhausting all 3 retries
@@ -641,7 +637,7 @@ mod tests {
641637
100,
642638
context_id.clone(),
643639
));
644-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
640+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
645641
mock_engine;
646642

647643
let ctx = Arc::new(Controller::new(context_id.clone()));
@@ -690,7 +686,7 @@ mod tests {
690686
100,
691687
context_id.clone(),
692688
));
693-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
689+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
694690
mock_engine;
695691

696692
let ctx = Arc::new(Controller::new(context_id.clone()));
@@ -739,7 +735,7 @@ mod tests {
739735
100,
740736
context_id.clone(),
741737
));
742-
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>> =
738+
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
743739
mock_engine;
744740

745741
let ctx = Arc::new(Controller::new(context_id.clone()));

0 commit comments

Comments
 (0)