comments

tzulingk · tzulingk · commit 8bb65dd70e66 · 2025-12-08T21:03:32.000-08:00
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
diff --git a/lib/llm/src/http/service/openai.rs b/lib/llm/src/http/service/openai.rs
@@ -935,11 +935,16 @@ async fn chat_completions(
     // note - we might do this as part of the post processing set to make it more generic
 
     if streaming {
+        // For streaming responses, we return HTTP 200 immediately without checking for errors.
+        // Once HTTP 200 OK is sent, we cannot change the status code, so any backend errors
+        // must be delivered as SSE events with `event: error` in the stream (handled by
+        // EventConverter and monitor_for_disconnects). This is standard SSE behavior.
         stream_handle.arm(); // allows the system to detect client disconnects and cancel the LLM generation
 
         let mut http_queue_guard = Some(http_queue_guard);
         let stream = stream.map(move |response| {
             // Calls observe_response() on each token
+            // EventConverter will detect `event: "error"` and convert to SSE error events
             process_response_using_event_converter_and_observe_metrics(
                 EventConverter::from(response),
                 &mut response_collector,