@@ -61,9 +61,9 @@ async def predict_end_of_turn(
6161
6262
6363class RecognitionHooks (Protocol ):
64- def on_start_of_speech (self , ev : vad .VADEvent ) -> None : ...
64+ def on_start_of_speech (self , ev : vad .VADEvent | None ) -> None : ...
6565 def on_vad_inference_done (self , ev : vad .VADEvent ) -> None : ...
66- def on_end_of_speech (self , ev : vad .VADEvent ) -> None : ...
66+ def on_end_of_speech (self , ev : vad .VADEvent | None ) -> None : ...
6767 def on_interim_transcript (self , ev : stt .SpeechEvent , * , speaking : bool | None ) -> None : ...
6868 def on_final_transcript (self , ev : stt .SpeechEvent ) -> None : ...
6969 def on_end_of_turn (self , info : _EndOfTurnInfo ) -> bool : ...
@@ -103,10 +103,13 @@ def __init__(
103103 self ._speaking = False
104104 self ._last_speaking_time : float = 0
105105 self ._last_final_transcript_time : float = 0
106+ # used for manual commit_user_turn
106107 self ._final_transcript_received = asyncio .Event ()
107108 self ._final_transcript_confidence : list [float ] = []
108109 self ._audio_transcript = ""
109110 self ._audio_interim_transcript = ""
111+ # used for STTs that support preflight mode, so it could start preemptive generation earlier
112+ self ._audio_preflight_transcript = ""
110113 self ._last_language : str | None = None
111114
112115 self ._stt_ch : aio .Chan [rtc .AudioFrame ] | None = None
@@ -191,6 +194,7 @@ def update_vad(self, vad: vad.VAD | None) -> None:
191194 def clear_user_turn (self ) -> None :
192195 self ._audio_transcript = ""
193196 self ._audio_interim_transcript = ""
197+ self ._audio_preflight_transcript = ""
194198 self ._final_transcript_confidence = []
195199 self ._user_turn_committed = False
196200
@@ -317,7 +321,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
317321 self ._audio_transcript += f" { transcript } "
318322 self ._audio_transcript = self ._audio_transcript .lstrip ()
319323 self ._final_transcript_confidence .append (confidence )
324+ transcript_changed = self ._audio_transcript != self ._audio_preflight_transcript
320325 self ._audio_interim_transcript = ""
326+ self ._audio_preflight_transcript = ""
321327 self ._final_transcript_received .set ()
322328
323329 if not self ._vad or self ._last_speaking_time == 0 :
@@ -328,7 +334,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
328334 # and using that timestamp for _last_speaking_time
329335 self ._last_speaking_time = time .time ()
330336
331- if self ._vad_base_turn_detection or self ._user_turn_committed :
337+ if transcript_changed and ( self ._vad_base_turn_detection or self ._user_turn_committed ) :
332338 self ._hooks .on_preemptive_generation (
333339 _PreemptiveGenerationInfo (
334340 new_transcript = self ._audio_transcript ,
@@ -341,20 +347,72 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
341347 )
342348 )
343349
344- if not self ._speaking :
345- chat_ctx = self ._hooks .retrieve_chat_ctx ().copy ()
346- self ._run_eou_detection (chat_ctx )
350+ if not self ._speaking :
351+ chat_ctx = self ._hooks .retrieve_chat_ctx ().copy ()
352+ self ._run_eou_detection (chat_ctx )
353+
354+ elif ev .type == stt .SpeechEventType .PREFLIGHT_TRANSCRIPT :
355+ self ._hooks .on_interim_transcript (ev , speaking = self ._speaking if self ._vad else None )
356+ transcript = ev .alternatives [0 ].text
357+ language = ev .alternatives [0 ].language
358+ confidence = ev .alternatives [0 ].confidence
359+
360+ if not self ._last_language or (
361+ language and len (transcript ) > MIN_LANGUAGE_DETECTION_LENGTH
362+ ):
363+ self ._last_language = language
364+
365+ if not transcript :
366+ return
367+
368+ logger .debug (
369+ "received user preflight transcript" ,
370+ extra = {"user_transcript" : transcript , "language" : self ._last_language },
371+ )
372+
373+ # still need to increment it as it's used for turn detection,
374+ self ._last_final_transcript_time = time .time ()
375+ # preflight transcript includes all pre-committed transcripts (including final transcript from the previous STT run)
376+ self ._audio_preflight_transcript = (self ._audio_transcript + " " + transcript ).lstrip ()
377+ self ._audio_interim_transcript = transcript
378+
379+ if not self ._vad or self ._last_speaking_time == 0 :
380+ # vad disabled, use stt timestamp
381+ self ._last_speaking_time = time .time ()
382+
383+ if self ._turn_detection_mode != "manual" or self ._user_turn_committed :
384+ confidence_vals = list (self ._final_transcript_confidence ) + [confidence ]
385+ self ._hooks .on_preemptive_generation (
386+ _PreemptiveGenerationInfo (
387+ new_transcript = self ._audio_preflight_transcript ,
388+ transcript_confidence = sum (confidence_vals ) / len (confidence_vals ),
389+ )
390+ )
347391
348392 elif ev .type == stt .SpeechEventType .INTERIM_TRANSCRIPT :
349393 self ._hooks .on_interim_transcript (ev , speaking = self ._speaking if self ._vad else None )
350394 self ._audio_interim_transcript = ev .alternatives [0 ].text
351395
352396 elif ev .type == stt .SpeechEventType .END_OF_SPEECH and self ._turn_detection_mode == "stt" :
397+ with trace .use_span (self ._ensure_user_turn_span ()):
398+ self ._hooks .on_end_of_speech (None )
399+
400+ self ._speaking = False
353401 self ._user_turn_committed = True
354- if not self ._speaking :
355- # start response after vad fires END_OF_SPEECH to avoid vad interruption
356- chat_ctx = self ._hooks .retrieve_chat_ctx ().copy ()
357- self ._run_eou_detection (chat_ctx )
402+ self ._last_speaking_time = time .time ()
403+
404+ chat_ctx = self ._hooks .retrieve_chat_ctx ().copy ()
405+ self ._run_eou_detection (chat_ctx )
406+
407+ elif ev .type == stt .SpeechEventType .START_OF_SPEECH and self ._turn_detection_mode == "stt" :
408+ with trace .use_span (self ._ensure_user_turn_span ()):
409+ self ._hooks .on_start_of_speech (None )
410+
411+ self ._speaking = True
412+ self ._last_speaking_time = time .time ()
413+
414+ if self ._end_of_turn_task is not None :
415+ self ._end_of_turn_task .cancel ()
358416
359417 async def _on_vad_event (self , ev : vad .VADEvent ) -> None :
360418 if ev .type == vad .VADEventType .START_OF_SPEECH :
0 commit comments