
Commit ed3ec2f

deepgram: support for Flux (#3245)
1 parent 808d091 commit ed3ec2f

7 files changed: +597 −15 lines

livekit-agents/livekit/agents/llm/llm.py — 2 additions & 1 deletion

@@ -264,7 +264,8 @@ async def _metrics_monitor_task(self, event_aiter: AsyncIterable[ChatChunk]) ->
 
         duration = time.perf_counter() - start_time
 
-        if self._current_attempt_has_error:
+        # if generation is aborted before any tokens are received, it doesn't make sense to report -1 ttft
+        if self._current_attempt_has_error or ttft < 0:
             return
 
         metrics = LLMMetrics(
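
For context on the new guard: ttft is typically initialized to -1 and set only once the first chunk arrives, so a stream that aborts early leaves it negative. A minimal standalone sketch of that pattern (variable and function names assumed, not the actual LLMStream internals):

import time
from collections.abc import AsyncIterable

async def monitor_ttft(chunks: AsyncIterable[str]) -> None:
    start_time = time.perf_counter()
    ttft = -1.0  # sentinel: stays negative if no token ever arrives

    async for _chunk in chunks:
        if ttft < 0:
            # record time-to-first-token exactly once
            ttft = time.perf_counter() - start_time

    duration = time.perf_counter() - start_time
    if ttft < 0:
        # generation aborted before any tokens: reporting -1 would skew metrics
        return
    print(f"ttft={ttft:.3f}s duration={duration:.3f}s")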

livekit-agents/livekit/agents/stt/stt.py — 5 additions & 0 deletions

@@ -29,6 +29,11 @@ class SpeechEventType(str, Enum):
    if the STT doesn't support this event, this will be emitted at the same time as the first INTERIM_TRANSCRIPT"""  # noqa: E501
     INTERIM_TRANSCRIPT = "interim_transcript"
     """interim transcript, useful for real-time transcription"""
+    PREFLIGHT_TRANSCRIPT = "preflight_transcript"
+    """preflight transcript, emitted when the STT is confident enough that a certain
+    portion of speech will not change. This differs from a final transcript in that
+    the same transcript may still be updated; but it is stable enough to be used for
+    preemptive generation"""
     FINAL_TRANSCRIPT = "final_transcript"
     """final transcript, emitted when the STT is confident enough that a certain
     portion of speech will not change"""
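
A hypothetical consumer illustrates where the new event sits between interim and final transcripts (the event names come from the diff above; the handler itself is illustrative only):

from livekit.agents import stt

def handle_speech_event(ev: stt.SpeechEvent) -> None:
    if ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
        # display-only: the text may still change substantially
        ...
    elif ev.type == stt.SpeechEventType.PREFLIGHT_TRANSCRIPT:
        # stable enough to kick off a speculative LLM response,
        # but the user turn is not committed yet
        ...
    elif ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
        # committed: safe to append to the chat context
        ...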

livekit-agents/livekit/agents/voice/agent_activity.py — 6 additions & 3 deletions

@@ -1141,18 +1141,21 @@ def _interrupt_by_audio_activity(self) -> None:
 
     # region recognition hooks
 
-    def on_start_of_speech(self, ev: vad.VADEvent) -> None:
+    def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
         self._session._update_user_state("speaking")
 
         if self._false_interruption_timer:
             # cancel the timer when user starts speaking but leave the paused state unchanged
             self._false_interruption_timer.cancel()
             self._false_interruption_timer = None
 
-    def on_end_of_speech(self, ev: vad.VADEvent) -> None:
+    def on_end_of_speech(self, ev: vad.VADEvent | None) -> None:
+        speech_end_time = time.time()
+        if ev:
+            speech_end_time = speech_end_time - ev.silence_duration
         self._session._update_user_state(
             "listening",
-            last_speaking_time=time.time() - ev.silence_duration,
+            last_speaking_time=speech_end_time,
         )
 
         if (
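
The None case covers STT-driven end-of-speech, where no VAD event (and therefore no silence_duration) is available. The fallback logic in isolation, as a minimal sketch:

import time

def resolve_last_speaking_time(silence_duration: float | None) -> float:
    # with a VAD event, back-date the end of speech by the trailing silence;
    # without one (STT-driven end of speech), fall back to the current time
    now = time.time()
    return now - silence_duration if silence_duration is not None else now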

livekit-agents/livekit/agents/voice/audio_recognition.py — 68 additions & 10 deletions

@@ -61,9 +61,9 @@ async def predict_end_of_turn(
 
 
 class RecognitionHooks(Protocol):
-    def on_start_of_speech(self, ev: vad.VADEvent) -> None: ...
+    def on_start_of_speech(self, ev: vad.VADEvent | None) -> None: ...
     def on_vad_inference_done(self, ev: vad.VADEvent) -> None: ...
-    def on_end_of_speech(self, ev: vad.VADEvent) -> None: ...
+    def on_end_of_speech(self, ev: vad.VADEvent | None) -> None: ...
     def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -> None: ...
     def on_final_transcript(self, ev: stt.SpeechEvent) -> None: ...
     def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: ...
@@ -103,10 +103,13 @@ def __init__(
         self._speaking = False
         self._last_speaking_time: float = 0
         self._last_final_transcript_time: float = 0
+        # used for manual commit_user_turn
         self._final_transcript_received = asyncio.Event()
         self._final_transcript_confidence: list[float] = []
         self._audio_transcript = ""
         self._audio_interim_transcript = ""
+        # used for STTs that support preflight mode, so preemptive generation can start earlier
+        self._audio_preflight_transcript = ""
         self._last_language: str | None = None
 
         self._stt_ch: aio.Chan[rtc.AudioFrame] | None = None
@@ -191,6 +194,7 @@ def update_vad(self, vad: vad.VAD | None) -> None:
     def clear_user_turn(self) -> None:
         self._audio_transcript = ""
         self._audio_interim_transcript = ""
+        self._audio_preflight_transcript = ""
         self._final_transcript_confidence = []
         self._user_turn_committed = False
 
@@ -317,7 +321,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             self._audio_transcript += f" {transcript}"
             self._audio_transcript = self._audio_transcript.lstrip()
             self._final_transcript_confidence.append(confidence)
+            transcript_changed = self._audio_transcript != self._audio_preflight_transcript
             self._audio_interim_transcript = ""
+            self._audio_preflight_transcript = ""
             self._final_transcript_received.set()
 
             if not self._vad or self._last_speaking_time == 0:
@@ -328,7 +334,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 # and using that timestamp for _last_speaking_time
                 self._last_speaking_time = time.time()
 
-            if self._vad_base_turn_detection or self._user_turn_committed:
+            if transcript_changed and (self._vad_base_turn_detection or self._user_turn_committed):
                 self._hooks.on_preemptive_generation(
                     _PreemptiveGenerationInfo(
                         new_transcript=self._audio_transcript,
@@ -341,20 +347,72 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 )
             )
 
-            if not self._speaking:
-                chat_ctx = self._hooks.retrieve_chat_ctx().copy()
-                self._run_eou_detection(chat_ctx)
+            if not self._speaking:
+                chat_ctx = self._hooks.retrieve_chat_ctx().copy()
+                self._run_eou_detection(chat_ctx)
+
+        elif ev.type == stt.SpeechEventType.PREFLIGHT_TRANSCRIPT:
+            self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None)
+            transcript = ev.alternatives[0].text
+            language = ev.alternatives[0].language
+            confidence = ev.alternatives[0].confidence
+
+            if not self._last_language or (
+                language and len(transcript) > MIN_LANGUAGE_DETECTION_LENGTH
+            ):
+                self._last_language = language
+
+            if not transcript:
+                return
+
+            logger.debug(
+                "received user preflight transcript",
+                extra={"user_transcript": transcript, "language": self._last_language},
+            )
+
+            # still needs to be updated, as it's used for turn detection
+            self._last_final_transcript_time = time.time()
+            # the preflight transcript includes all pre-committed transcripts (including the final transcript from the previous STT run)
+            self._audio_preflight_transcript = (self._audio_transcript + " " + transcript).lstrip()
+            self._audio_interim_transcript = transcript
+
+            if not self._vad or self._last_speaking_time == 0:
+                # vad disabled, use stt timestamp
+                self._last_speaking_time = time.time()
+
+            if self._turn_detection_mode != "manual" or self._user_turn_committed:
+                confidence_vals = list(self._final_transcript_confidence) + [confidence]
+                self._hooks.on_preemptive_generation(
+                    _PreemptiveGenerationInfo(
+                        new_transcript=self._audio_preflight_transcript,
+                        transcript_confidence=sum(confidence_vals) / len(confidence_vals),
+                    )
+                )
 
         elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
             self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None)
             self._audio_interim_transcript = ev.alternatives[0].text
 
         elif ev.type == stt.SpeechEventType.END_OF_SPEECH and self._turn_detection_mode == "stt":
+            with trace.use_span(self._ensure_user_turn_span()):
+                self._hooks.on_end_of_speech(None)
+
+            self._speaking = False
             self._user_turn_committed = True
-            if not self._speaking:
-                # start response after vad fires END_OF_SPEECH to avoid vad interruption
-                chat_ctx = self._hooks.retrieve_chat_ctx().copy()
-                self._run_eou_detection(chat_ctx)
+            self._last_speaking_time = time.time()
+
+            chat_ctx = self._hooks.retrieve_chat_ctx().copy()
+            self._run_eou_detection(chat_ctx)
+
+        elif ev.type == stt.SpeechEventType.START_OF_SPEECH and self._turn_detection_mode == "stt":
+            with trace.use_span(self._ensure_user_turn_span()):
+                self._hooks.on_start_of_speech(None)
+
+            self._speaking = True
+            self._last_speaking_time = time.time()
+
+            if self._end_of_turn_task is not None:
+                self._end_of_turn_task.cancel()
 
     async def _on_vad_event(self, ev: vad.VADEvent) -> None:
        if ev.type == vad.VADEventType.START_OF_SPEECH:
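
Reduced to its string bookkeeping, the preflight flow works like this: each preflight event rebuilds a candidate transcript from the committed text plus the stable tail, and a later final transcript re-triggers preemptive generation only if it differs from what preflight already predicted. A simplified sketch of that logic (helper names are hypothetical, not the actual class):

def build_preflight(committed: str, preflight_text: str) -> str:
    # the preflight transcript includes everything already committed
    # plus the stable-but-uncommitted tail from the current STT run
    return (committed + " " + preflight_text).lstrip()

def commit_final(committed: str, final_text: str, preflight: str) -> tuple[str, bool]:
    committed = (committed + " " + final_text).lstrip()
    # if the final transcript matches the last preflight, the preemptive
    # generation already started from it is still valid: no need to re-trigger
    transcript_changed = committed != preflight
    return committed, transcript_changed

With this check, an STT that emits an accurate preflight gets its response generated during the preflight window, and the matching final transcript becomes a no-op.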

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/__init__.py — 2 additions & 1 deletion

@@ -20,10 +20,11 @@
 """
 
 from .stt import STT, SpeechStream
+from .stt_v2 import SpeechStreamv2, STTv2
 from .tts import TTS
 from .version import __version__
 
-__all__ = ["STT", "SpeechStream", "__version__", "TTS"]
+__all__ = ["STT", "SpeechStream", "STTv2", "SpeechStreamv2", "__version__", "TTS"]
 
 
 from livekit.agents import Plugin

livekit-plugins/livekit-plugins-deepgram/livekit/plugins/deepgram/models.py — 2 additions & 0 deletions

@@ -35,6 +35,8 @@
     "whisper-large",
 ]
 
+V2Models = Literal["flux-general-en"]
+
 DeepgramLanguages = Literal[
     "zh",
     "zh-CN",
