
Commit ed3ec2f

deepgram: support for Flux (#3245)
1 parent 808d091 commit ed3ec2f

7 files changed: +597 −15 lines

livekit-agents/livekit/agents/llm/llm.py — 2 additions & 1 deletion

@@ -264,7 +264,8 @@ async def _metrics_monitor_task(self, event_aiter: AsyncIterable[ChatChunk]) ->
 
         duration = time.perf_counter() - start_time
 
-        if self._current_attempt_has_error:
+        # if generation is aborted before any tokens are received, it doesn't make sense to report -1 ttft
+        if self._current_attempt_has_error or ttft < 0:
             return
 
         metrics = LLMMetrics(
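
For context on the new guard: ttft is typically initialized to -1 and set only once the first chunk arrives, so a stream that aborts early leaves it negative. A minimal standalone sketch of that pattern (variable and function names assumed, not the actual LLMStream internals):

import time
from collections.abc import AsyncIterable

async def monitor_ttft(chunks: AsyncIterable[str]) -> None:
    start_time = time.perf_counter()
    ttft = -1.0  # sentinel: stays negative if no token ever arrives

    async for _chunk in chunks:
        if ttft < 0:
            # record time-to-first-token exactly once
            ttft = time.perf_counter() - start_time

    duration = time.perf_counter() - start_time
    if ttft < 0:
        # generation aborted before any tokens: reporting -1 would skew metrics
        return
    print(f"ttft={ttft:.3f}s duration={duration:.3f}s")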

livekit-agents/livekit/agents/stt/stt.py — 5 additions & 0 deletions

@@ -29,6 +29,11 @@ class SpeechEventType(str, Enum):
    if the STT doesn't support this event, this will be emitted at the same time as the first INTERIM_TRANSCRIPT"""  # noqa: E501
     INTERIM_TRANSCRIPT = "interim_transcript"
     """interim transcript, useful for real-time transcription"""
+    PREFLIGHT_TRANSCRIPT = "preflight_transcript"
+    """preflight transcript, emitted when the STT is confident enough that a certain
+    portion of speech will not change. This differs from a final transcript in that
+    the same transcript may still be updated; but it is stable enough to be used for
+    preemptive generation"""
     FINAL_TRANSCRIPT = "final_transcript"
     """final transcript, emitted when the STT is confident enough that a certain
     portion of speech will not change"""
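
A hypothetical consumer illustrates where the new event sits between interim and final transcripts (the event names come from the diff above; the handler itself is illustrative only):

from livekit.agents import stt

def handle_speech_event(ev: stt.SpeechEvent) -> None:
    if ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
        # display-only: the text may still change substantially
        ...
    elif ev.type == stt.SpeechEventType.PREFLIGHT_TRANSCRIPT:
        # stable enough to kick off a speculative LLM response,
        # but the user turn is not committed yet
        ...
    elif ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
        # committed: safe to append to the chat context
        ...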

livekit-agents/livekit/agents/voice/agent_activity.py — 6 additions & 3 deletions

@@ -1141,18 +1141,21 @@ def _interrupt_by_audio_activity(self) -> None:
 
     # region recognition hooks
 
-    def on_start_of_speech(self, ev: vad.VADEvent) -> None:
+    def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
         self._session._update_user_state("speaking")
 
         if self._false_interruption_timer:
             # cancel the timer when user starts speaking but leave the paused state unchanged
             self._false_interruption_timer.cancel()
             self._false_interruption_timer = None
 
-    def on_end_of_speech(self, ev: vad.VADEvent) -> None:
+    def on_end_of_speech(self, ev: vad.VADEvent | None) -> None:
+        speech_end_time = time.time()
+        if ev:
+            speech_end_time = speech_end_time - ev.silence_duration
         self._session._update_user_state(
             "listening",
-            last_speaking_time=time.time() - ev.silence_duration,
+            last_speaking_time=speech_end_time,
         )
 
         if (
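
The None case covers STT-driven end-of-speech, where no VAD event (and therefore no silence_duration) is available. The fallback logic in isolation, as a minimal sketch:

import time

def resolve_last_speaking_time(silence_duration: float | None) -> float:
    # with a VAD event, back-date the end of speech by the trailing silence;
    # without one (STT-driven end of speech), fall back to the current time
    now = time.time()
    return now - silence_duration if silence_duration is not None else now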

livekit-agents/livekit/agents/voice/audio_recognition.py — 68 additions & 10 deletions

@@ -61,9 +61,9 @@ async def predict_end_of_turn(
 
 
 class RecognitionHooks(Protocol):
-    def on_start_of_speech(self, ev: vad.VADEvent) -> None: ...
+    def on_start_of_speech(self, ev: vad.VADEvent | None) -> None: ...
     def on_vad_inference_done(self, ev: vad.VADEvent) -> None: ...
-    def on_end_of_speech(self, ev: vad.VADEvent) -> None: ...
+    def on_end_of_speech(self, ev: vad.VADEvent | None) -> None: ...
     def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -> None: ...
     def on_final_transcript(self, ev: stt.SpeechEvent) -> None: ...
     def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: ...
@@ -103,10 +103,13 @@ def __init__(
         self._speaking = False
         self._last_speaking_time: float = 0
         self._last_final_transcript_time: float = 0
+        # used for manual commit_user_turn
         self._final_transcript_received = asyncio.Event()
         self._final_transcript_confidence: list[float] = []
         self._audio_transcript = ""
         self._audio_interim_transcript = ""
+        # used for STTs that support preflight mode, so preemptive generation can start earlier
+        self._audio_preflight_transcript = ""
         self._last_language: str | None = None
 
         self._stt_ch: aio.Chan[rtc.AudioFrame] | None = None
@@ -191,6 +194,7 @@ def update_vad(self, vad: vad.VAD | None) -> None:
     def clear_user_turn(self) -> None:
         self._audio_transcript = ""
         self._audio_interim_transcript = ""
+        self._audio_preflight_transcript = ""
         self._final_transcript_confidence = []
         self._user_turn_committed = False
 
@@ -317,7 +321,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             self._audio_transcript += f" {transcript}"
             self._audio_transcript = self._audio_transcript.lstrip()
             self._final_transcript_confidence.append(confidence)
+            transcript_changed = self._audio_transcript != self._audio_preflight_transcript
             self._audio_interim_transcript = ""
+            self._audio_preflight_transcript = ""
             self._final_transcript_received.set()
 
             if not self._vad or self._last_speaking_time == 0:
@@ -328,7 +334,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 # and using that timestamp for _last_speaking_time
                 self._last_speaking_time = time.time()
 
-            if self._vad_base_turn_detection or self._user_turn_committed:
+            if transcript_changed and (self._vad_base_turn_detection or self._user_turn_committed):
                 self._hooks.on_preemptive_generation(
                     _PreemptiveGenerationInfo(
                         new_transcript=self._audio_transcript,
@@ -341,20 +347,72 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 )
             )
 
-            if not self._speaking:
-                chat_ctx = self._hooks.retrieve_chat_ctx().copy()
-                self._run_eou_detection(chat_ctx)
+            if not self._speaking:
+                chat_ctx = self._hooks.retrieve_chat_ctx().copy()
+                self._run_eou_detection(chat_ctx)
+
+        elif ev.type == stt.SpeechEventType.PREFLIGHT_TRANSCRIPT:
+            self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None)
+            transcript = ev.alternatives[0].text
+            language = ev.alternatives[0].language
+            confidence = ev.alternatives[0].confidence
+
+            if not self._last_language or (
+                language and len(transcript) > MIN_LANGUAGE_DETECTION_LENGTH
+            ):
+                self._last_language = language
+
+            if not transcript:
+                return
+
+            logger.debug(
+                "received user preflight transcript",
+                extra={"user_transcript": transcript, "language": self._last_language},
+            )
+
+            # still needs to be updated, as it's used for turn detection
+            self._last_final_transcript_time = time.time()
+            # the preflight transcript includes all pre-committed transcripts (including the final transcript from the previous STT run)
+            self._audio_preflight_transcript = (self._audio_transcript + " " + transcript).lstrip()
+            self._audio_interim_transcript = transcript
+
+            if not self._vad or self._last_speaking_time == 0:
+                # vad disabled, use stt timestamp
+                self._last_speaking_time = time.time()
+
+            if self._turn_detection_mode != "manual" or self._user_turn_committed:
+                confidence_vals = list(self._final_transcript_confidence) + [confidence]
+                self._hooks.on_preemptive_generation(
+                    _PreemptiveGenerationInfo(
+                        new_transcript=self._audio_preflight_transcript,
+                        transcript_confidence=sum(confidence_vals) / len(confidence_vals),
+                    )
+                )
 
         elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
             self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None)
             self._audio_interim_transcript = ev.alternatives[0].text
 
         elif ev.type == stt.SpeechEventType.END_OF_SPEECH and self._turn_detection_mode == "stt":
+            with trace.use_span(self._ensure_user_turn_span()):
+                self._hooks.on_end_of_speech(None)
+
+            self._speaking = False
             self._user_turn_committed = True
-            if not self._speaking:
-                # start response after vad fires END_OF_SPEECH to avoid vad interruption
-                chat_ctx = self._hooks.retrieve_chat_ctx().copy()
-                self._run_eou_detection(chat_ctx)
+            self._last_speaking_time = time.time()
+
+            chat_ctx = self._hooks.retrieve_chat_ctx().copy()
+            self._run_eou_detection(chat_ctx)
+
+        elif ev.type == stt.SpeechEventType.START_OF_SPEECH and self._turn_detection_mode == "stt":
+            with trace.use_span(self._ensure_user_turn_span()):
+                self._hooks.on_start_of_speech(None)
+
+            self._speaking = True
+            self._last_speaking_time = time.time()
+
+            if self._end_of_turn_task is not None:
+                self._end_of_turn_task.cancel()
 
     async def _on_vad_event(self, ev: vad.VADEvent) -> None:
        if ev.type == vad.VADEventType.START_OF_SPEECH:
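
Reduced to its string bookkeeping, the preflight flow works like this: each preflight event rebuilds a candidate transcript from the committed text plus the stable tail, and a later final transcript re-triggers preemptive generation only if it differs from what preflight already predicted. A simplified sketch of that logic (helper names are hypothetical, not the actual class):

def build_preflight(committed: str, preflight_text: str) -> str:
    # the preflight transcript includes everything already committed
    # plus the stable-but-uncommitted tail from the current STT run
    return (committed + " " + preflight_text).lstrip()

def commit_final(committed: str, final_text: str, preflight: str) -> tuple[str, bool]:
    committed = (committed + " " + final_text).lstrip()
    # if the final transcript matches the last preflight, the preemptive
    # generation already started from it is still valid: no need to re-trigger
    transcript_changed = committed != preflight
    return committed, transcript_changed

With this check, an STT that emits an accurate preflight gets its response generated during the preflight window, and the matching final transcript becomes a no-op.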

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py — 2 additions & 1 deletion

@@ -20,10 +20,11 @@
 """
 
 from .stt import STT, SpeechStream
+from .stt_v2 import SpeechStreamv2, STTv2
 from .tts import TTS
 from .version import __version__
 
-__all__ = ["STT", "SpeechStream", "__version__", "TTS"]
+__all__ = ["STT", "SpeechStream", "STTv2", "SpeechStreamv2", "__version__", "TTS"]
 
 
 from livekit.agents import Plugin

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/models.py — 2 additions & 0 deletions

@@ -35,6 +35,8 @@
     "whisper-large",
 ]
 
+V2Models = Literal["flux-general-en"]
+
 DeepgramLanguages = Literal[
     "zh",
     "zh-CN",
