diff --git a/apps/web/src/components/ChatView.browser.tsx b/apps/web/src/components/ChatView.browser.tsx index faecc7f51b..d31a09b48c 100644 --- a/apps/web/src/components/ChatView.browser.tsx +++ b/apps/web/src/components/ChatView.browser.tsx @@ -21,6 +21,7 @@ import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it, vi } import { render } from "vitest-browser-react"; import { useComposerDraftStore } from "../composerDraftStore"; +import { stopPlayback } from "../features/tts/tts"; import { isMacPlatform } from "../lib/utils"; import { getRouter } from "../router"; import { useStore } from "../store"; @@ -91,6 +92,23 @@ interface MountedChatView { router: ReturnType; } +class MockSpeechSynthesisUtterance { + readonly text: string; + lang = ""; + voice: SpeechSynthesisVoice | null = null; + onend: (() => void) | null = null; + onerror: ((event: { error?: string }) => void) | null = null; + + constructor(text: string) { + this.text = text; + } +} + +interface BrowserSpeechMockState { + readonly speakCalls: MockSpeechSynthesisUtterance[]; + cancelCount: number; +} + function isoAt(offsetSeconds: number): string { return new Date(BASE_TIME_MS + offsetSeconds * 1_000).toISOString(); } @@ -150,6 +168,49 @@ function createAssistantMessage(options: { id: MessageId; text: string; offsetSe }; } +function createSnapshotForAssistantTts(options: { + assistantMessages: ReadonlyArray<{ + id: MessageId; + text: string; + streaming?: boolean; + }>; +}): OrchestrationReadModel { + const baseSnapshot = createSnapshotForTargetUser({ + targetMessageId: "msg-user-assistant-tts" as MessageId, + targetText: "assistant tts target", + }); + + return { + ...baseSnapshot, + threads: baseSnapshot.threads.map((thread) => + thread.id === THREAD_ID + ? 
{ + ...thread, + messages: options.assistantMessages.flatMap((assistantMessage, index) => { + const offsetSeconds = index * 4; + return [ + createUserMessage({ + id: `msg-user-assistant-tts-${index}` as MessageId, + text: `user message ${index + 1}`, + offsetSeconds, + }), + { + ...createAssistantMessage({ + id: assistantMessage.id, + text: assistantMessage.text, + offsetSeconds: offsetSeconds + 1, + }), + streaming: Boolean(assistantMessage.streaming), + updatedAt: isoAt(offsetSeconds + 2), + }, + ]; + }), + } + : thread, + ), + }; +} + function createSnapshotForTargetUser(options: { targetMessageId: MessageId; targetText: string; @@ -543,6 +604,47 @@ async function waitForInteractionModeButton( ); } +async function waitForButtonByTitle(title: string): Promise { + return waitForElement( + () => + Array.from(document.querySelectorAll("button")).find( + (button) => button.title === title, + ) as HTMLButtonElement | null, + `Unable to find button titled "${title}".`, + ); +} + +function queryButtonByTitle(title: string): HTMLButtonElement | null { + return ( + (Array.from(document.querySelectorAll("button")).find((button) => button.title === title) as + | HTMLButtonElement + | undefined) ?? 
null + ); +} + +function installSpeechSynthesisMock(): BrowserSpeechMockState { + const state: BrowserSpeechMockState = { + speakCalls: [], + cancelCount: 0, + }; + + vi.stubGlobal("speechSynthesis", { + cancel: vi.fn(() => { + state.cancelCount += 1; + }), + getVoices: vi.fn(() => [{ default: true, lang: "en-US" } as SpeechSynthesisVoice]), + speak: vi.fn((utterance: SpeechSynthesisUtterance) => { + state.speakCalls.push(utterance as unknown as MockSpeechSynthesisUtterance); + }), + } satisfies Partial); + vi.stubGlobal( + "SpeechSynthesisUtterance", + MockSpeechSynthesisUtterance as unknown as typeof SpeechSynthesisUtterance, + ); + + return state; +} + async function waitForImagesToLoad(scope: ParentNode): Promise { const images = Array.from(scope.querySelectorAll("img")); if (images.length === 0) { @@ -716,6 +818,8 @@ describe("ChatView timeline estimator parity (full app)", () => { localStorage.clear(); document.body.innerHTML = ""; wsRequests.length = 0; + stopPlayback(); + vi.unstubAllGlobals(); useComposerDraftStore.setState({ draftsByThreadId: {}, draftThreadsByThreadId: {}, @@ -729,6 +833,8 @@ describe("ChatView timeline estimator parity (full app)", () => { }); afterEach(() => { + stopPlayback(); + vi.unstubAllGlobals(); document.body.innerHTML = ""; }); @@ -1247,4 +1353,153 @@ describe("ChatView timeline estimator parity (full app)", () => { await mounted.cleanup(); } }); + + it("renders a TTS play button for completed assistant messages and speaks sanitized text", async () => { + const speech = installSpeechSynthesisMock(); + const mounted = await mountChatView({ + viewport: DEFAULT_VIEWPORT, + snapshot: createSnapshotForAssistantTts({ + assistantMessages: [ + { + id: "msg-assistant-tts-play" as MessageId, + text: [ + "Here is the answer.", + "", + "```ts", + "const value = 1;", + "```", + "", + "See [the docs](https://example.com/docs).", + ].join("\n"), + }, + ], + }), + }); + + try { + const playButton = await waitForButtonByTitle("Play message"); + 
playButton.click(); + + await vi.waitFor( + () => { + expect(speech.speakCalls).toHaveLength(1); + expect(speech.speakCalls[0]?.text).toBe( + [ + "Here is the answer.", + "", + "TypeScript Code Block - Open the chat to view the code.", + "", + "See the docs.", + ].join("\n"), + ); + expect(queryButtonByTitle("Stop playback")).toBeTruthy(); + }, + { timeout: 8_000, interval: 16 }, + ); + } finally { + await mounted.cleanup(); + } + }); + + it("does not render a TTS button for streaming assistant messages", async () => { + installSpeechSynthesisMock(); + const mounted = await mountChatView({ + viewport: DEFAULT_VIEWPORT, + snapshot: createSnapshotForAssistantTts({ + assistantMessages: [ + { + id: "msg-assistant-tts-streaming" as MessageId, + text: "Still streaming", + streaming: true, + }, + ], + }), + }); + + try { + await waitForLayout(); + expect(queryButtonByTitle("Play message")).toBeNull(); + expect(queryButtonByTitle("Stop playback")).toBeNull(); + } finally { + await mounted.cleanup(); + } + }); + + it("stops playback when the active assistant TTS button is clicked again", async () => { + const speech = installSpeechSynthesisMock(); + const mounted = await mountChatView({ + viewport: DEFAULT_VIEWPORT, + snapshot: createSnapshotForAssistantTts({ + assistantMessages: [ + { + id: "msg-assistant-tts-stop" as MessageId, + text: "Stop me if you have heard enough.", + }, + ], + }), + }); + + try { + (await waitForButtonByTitle("Play message")).click(); + (await waitForButtonByTitle("Stop playback")).click(); + + await vi.waitFor( + () => { + expect(speech.cancelCount).toBe(2); + expect(queryButtonByTitle("Stop playback")).toBeNull(); + expect(queryButtonByTitle("Play message")).toBeTruthy(); + }, + { timeout: 8_000, interval: 16 }, + ); + } finally { + await mounted.cleanup(); + } + }); + + it("switches TTS playback when a different assistant message is selected", async () => { + const speech = installSpeechSynthesisMock(); + const mounted = await mountChatView({ + 
viewport: DEFAULT_VIEWPORT, + snapshot: createSnapshotForAssistantTts({ + assistantMessages: [ + { + id: "msg-assistant-tts-first" as MessageId, + text: "First response.", + }, + { + id: "msg-assistant-tts-second" as MessageId, + text: "Second response.", + }, + ], + }), + }); + + try { + let playButtons: HTMLButtonElement[] = []; + await vi.waitFor( + () => { + playButtons = Array.from(document.querySelectorAll("button")).filter( + (button) => button.title === "Play message", + ) as HTMLButtonElement[]; + expect(playButtons).toHaveLength(2); + }, + { timeout: 8_000, interval: 16 }, + ); + + playButtons[0]?.click(); + playButtons[1]?.click(); + + await vi.waitFor( + () => { + expect(speech.speakCalls).toHaveLength(2); + expect(speech.speakCalls[0]?.text).toBe("First response."); + expect(speech.speakCalls[1]?.text).toBe("Second response."); + expect(speech.cancelCount).toBe(2); + }, + { timeout: 8_000, interval: 16 }, + ); + } finally { + await mounted.cleanup(); + } + }); }); diff --git a/apps/web/src/components/chat/MessagesTimeline.tsx b/apps/web/src/components/chat/MessagesTimeline.tsx index e30801041f..00237de2be 100644 --- a/apps/web/src/components/chat/MessagesTimeline.tsx +++ b/apps/web/src/components/chat/MessagesTimeline.tsx @@ -36,6 +36,7 @@ import { computeMessageDurationStart, normalizeCompactToolLabel } from "./Messag import { cn } from "~/lib/utils"; import { type TimestampFormat } from "../../appSettings"; import { formatTimestamp } from "../../timestampFormat"; +import { AssistantMessageTtsButton } from "../../features/tts/AssistantMessageTtsButton"; const MAX_VISIBLE_WORK_LOG_ENTRIES = 6; const ALWAYS_UNVIRTUALIZED_TAIL_ROWS = 8; @@ -429,6 +430,11 @@ export const MessagesTimeline = memo(function MessagesTimeline({ cwd={markdownCwd} isStreaming={Boolean(row.message.streaming)} /> + {!row.message.streaming && row.message.text.trim().length > 0 ? ( +
+ +
+ ) : null} {(() => { const turnSummary = turnDiffSummaryByAssistantMessageId.get(row.message.id); if (!turnSummary) return null; diff --git a/apps/web/src/features/tts/AssistantMessageTtsButton.tsx b/apps/web/src/features/tts/AssistantMessageTtsButton.tsx new file mode 100644 index 0000000000..e84b44c7ee --- /dev/null +++ b/apps/web/src/features/tts/AssistantMessageTtsButton.tsx @@ -0,0 +1,32 @@ +import { memo } from "react"; +import { PlayIcon, SquareIcon } from "lucide-react"; +import { Button } from "~/components/ui/button"; +import { useMessageTts } from "./useMessageTts"; + +export const AssistantMessageTtsButton = memo(function AssistantMessageTtsButton({ + messageId, + text, +}: { + messageId: string; + text: string; +}) { + const { supported, canPlay, isPlaying, title, toggle } = useMessageTts(messageId, text); + + if (!supported || !canPlay) { + return null; + } + + return ( + + ); +}); diff --git a/apps/web/src/features/tts/nativeSpeechSynthesis.ts b/apps/web/src/features/tts/nativeSpeechSynthesis.ts new file mode 100644 index 0000000000..66f2ba55b2 --- /dev/null +++ b/apps/web/src/features/tts/nativeSpeechSynthesis.ts @@ -0,0 +1,120 @@ +export interface NativeSpeechSpeakInput { + readonly text: string; + readonly lang?: string; + readonly onEnd: () => void; + readonly onError: (error: Error) => void; +} + +export interface NativeSpeechController { + readonly isSupported: () => boolean; + readonly speak: (input: NativeSpeechSpeakInput) => void; + readonly stop: () => void; +} + +function getSpeechEnvironment(): { + readonly speechSynthesis?: SpeechSynthesis; + readonly SpeechSynthesisUtterance?: typeof SpeechSynthesisUtterance; + readonly navigator?: Navigator; + readonly document?: Document; +} { + const candidate = globalThis as typeof globalThis & { + speechSynthesis?: SpeechSynthesis; + SpeechSynthesisUtterance?: typeof SpeechSynthesisUtterance; + navigator?: Navigator; + document?: Document; + }; + + return { + speechSynthesis: 
candidate.speechSynthesis, + SpeechSynthesisUtterance: candidate.SpeechSynthesisUtterance, + navigator: candidate.navigator, + document: candidate.document, + }; +} + +function resolveSpeechLang(explicitLang?: string): string | undefined { + const lang = explicitLang?.trim(); + if (lang) { + return lang; + } + + const environment = getSpeechEnvironment(); + const documentLang = environment.document?.documentElement.lang?.trim(); + if (documentLang) { + return documentLang; + } + + const navigatorLang = environment.navigator?.language?.trim(); + return navigatorLang || undefined; +} + +function selectVoice( + speechSynthesis: SpeechSynthesis, + lang: string | undefined, +): SpeechSynthesisVoice | null { + const voices = speechSynthesis.getVoices(); + if (voices.length === 0) { + return null; + } + + if (!lang) { + return voices.find((voice) => voice.default) ?? voices[0] ?? null; + } + + const normalizedLang = lang.toLowerCase(); + const primaryLanguage = normalizedLang.split("-")[0]; + const exactMatch = voices.find((voice) => voice.lang.toLowerCase() === normalizedLang); + if (exactMatch) { + return exactMatch; + } + + const primaryMatch = voices.find((voice) => + voice.lang.toLowerCase().startsWith(`${primaryLanguage}-`), + ); + if (primaryMatch) { + return primaryMatch; + } + + return voices.find((voice) => voice.default) ?? voices[0] ?? 
null; +} + +export function createNativeSpeechController(): NativeSpeechController { + return { + isSupported() { + const environment = getSpeechEnvironment(); + return Boolean(environment.speechSynthesis && environment.SpeechSynthesisUtterance); + }, + + speak(input) { + const environment = getSpeechEnvironment(); + if (!environment.speechSynthesis || !environment.SpeechSynthesisUtterance) { + throw new Error("Speech synthesis unavailable."); + } + + const utterance = new environment.SpeechSynthesisUtterance(input.text); + const lang = resolveSpeechLang(input.lang); + if (lang) { + utterance.lang = lang; + } + + const voice = selectVoice(environment.speechSynthesis, utterance.lang || lang); + if (voice) { + utterance.voice = voice; + } + + utterance.onend = () => { + input.onEnd(); + }; + utterance.onerror = (event) => { + input.onError(new Error(event.error || "Speech synthesis failed.")); + }; + + environment.speechSynthesis.speak(utterance); + }, + + stop() { + const environment = getSpeechEnvironment(); + environment.speechSynthesis?.cancel(); + }, + }; +} diff --git a/apps/web/src/features/tts/sanitizeTtsText.test.ts b/apps/web/src/features/tts/sanitizeTtsText.test.ts new file mode 100644 index 0000000000..72fcb59aeb --- /dev/null +++ b/apps/web/src/features/tts/sanitizeTtsText.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it } from "vitest"; +import { sanitizeAssistantMessageForTts } from "./sanitizeTtsText"; + +describe("sanitizeAssistantMessageForTts", () => { + it("preserves plain prose while normalizing whitespace", () => { + expect(sanitizeAssistantMessageForTts("Hello world.\n\n")).toBe("Hello world."); + }); + + it("keeps markdown link labels and drops URLs", () => { + expect( + sanitizeAssistantMessageForTts("Read [the docs](https://example.com/docs) for more details."), + ).toBe("Read the docs for more details."); + }); + + it("flattens inline code into readable prose", () => { + expect(sanitizeAssistantMessageForTts("Run `bun lint` before 
shipping.")).toBe( + "Run bun lint before shipping.", + ); + }); + + it("replaces labeled code fences with a language-specific placeholder", () => { + expect(sanitizeAssistantMessageForTts("```ts\nconst value = 1;\n```")).toBe( + "TypeScript Code Block - Open the chat to view the code.", + ); + }); + + it("replaces unlabeled code fences with a generic placeholder", () => { + expect(sanitizeAssistantMessageForTts("```\nconst value = 1;\n```")).toBe( + "Code Block - Open the chat to view the code.", + ); + }); + + it("replaces multiple code fences independently", () => { + expect( + sanitizeAssistantMessageForTts( + ["```python", "print('hi')", "```", "", "```sh", "echo hi", "```"].join("\n"), + ), + ).toBe( + [ + "Python Code Block - Open the chat to view the code.", + "", + "Shell Code Block - Open the chat to view the code.", + ].join("\n"), + ); + }); + + it("returns empty string for markdown-only filler without speakable text", () => { + expect(sanitizeAssistantMessageForTts("###\n\n---\n\n>")).toBe(""); + }); +}); diff --git a/apps/web/src/features/tts/sanitizeTtsText.ts b/apps/web/src/features/tts/sanitizeTtsText.ts new file mode 100644 index 0000000000..8a38adf63a --- /dev/null +++ b/apps/web/src/features/tts/sanitizeTtsText.ts @@ -0,0 +1,106 @@ +const CODE_BLOCK_SUFFIX = "Code Block - Open the chat to view the code."; + +const CODE_LANGUAGE_LABELS: Record = { + bash: "Shell", + c: "C", + "c#": "C Sharp", + "c++": "C Plus Plus", + cpp: "C Plus Plus", + cs: "C Sharp", + css: "CSS", + go: "Go", + html: "HTML", + java: "Java", + javascript: "JavaScript", + js: "JavaScript", + json: "JSON", + jsx: "JavaScript", + markdown: "Markdown", + md: "Markdown", + php: "PHP", + py: "Python", + python: "Python", + rb: "Ruby", + ruby: "Ruby", + rust: "Rust", + sh: "Shell", + shell: "Shell", + sql: "SQL", + swift: "Swift", + ts: "TypeScript", + tsx: "TypeScript", + typescript: "TypeScript", + xml: "XML", + yaml: "YAML", + yml: "YAML", + zsh: "Shell", +}; + +function 
normalizeLanguageLabel(infoString: string): string | null {
+  /*
+   * Turns a code-fence info string (e.g. "ts" or "python title=x") into a
+   * speakable language name. Only the first whitespace-delimited token is
+   * considered. Yields null when there is no token or when nothing containing
+   * a letter survives cleaning.
+   */
+  const languageToken = infoString.trim().split(/\s+/)[0]?.trim().toLowerCase();
+  if (!languageToken) {
+    return null;
+  }
+
+  /*
+   * Own-property check: CODE_LANGUAGE_LABELS is an object literal, so a bare
+   * CODE_LANGUAGE_LABELS[token] read also resolves inherited keys such as
+   * "constructor" or "toString" to truthy non-string values, which would then
+   * be interpolated into the spoken placeholder.
+   */
+  const knownLabel = Object.prototype.hasOwnProperty.call(CODE_LANGUAGE_LABELS, languageToken)
+    ? CODE_LANGUAGE_LABELS[languageToken]
+    : undefined;
+  if (knownLabel) {
+    return knownLabel;
+  }
+
+  /* Unknown language: keep only letters, digits, "#", "+", "." and "-". */
+  const cleaned = languageToken.replace(/[^a-z0-9#+.-]/gi, "");
+  if (!cleaned || !/[a-z]/i.test(cleaned)) {
+    return null;
+  }
+
+  /* Split on separator runs and capitalise each piece, e.g. "objective-c" -> "Objective C". */
+  return cleaned
+    .split(/[-_.]+/)
+    .filter((segment) => segment.length > 0)
+    .map((segment) => `${segment.slice(0, 1).toUpperCase()}${segment.slice(1)}`)
+    .join(" ");
+}
+
+/**
+ * Builds the sentence spoken in place of a fenced code block: the shared
+ * suffix, prefixed with the language label when one can be derived from
+ * `infoString`.
+ */
+function buildCodeBlockPlaceholder(infoString: string): string {
+  const languageLabel = normalizeLanguageLabel(infoString);
+  if (!languageLabel) {
+    return CODE_BLOCK_SUFFIX;
+  }
+
+  return `${languageLabel} ${CODE_BLOCK_SUFFIX}`;
+}
+
+/** True when `value` contains at least one Unicode letter or number. */
+function hasSpeakableText(value: string): boolean {
+  return value.replace(/[^\p{L}\p{N}]+/gu, "").length > 0;
+}
+
+/**
+ * Reduces assistant markdown to plain text suitable for speech synthesis.
+ *
+ * Fenced code blocks become a short spoken placeholder; images and links keep
+ * only their alt text / label; URLs, inline-code backticks, heading, quote and
+ * list markers, rules, emphasis characters, HTML tags and table pipes are
+ * removed; whitespace is then normalised.
+ *
+ * @param text Raw assistant message text.
+ * @returns The sanitized text, or "" when the input is blank or no letters or
+ *   digits remain after sanitizing.
+ */
+export function sanitizeAssistantMessageForTts(text: string): string {
+  let sanitized = text.trim();
+  if (sanitized.length === 0) {
+    return "";
+  }
+
+  /*
+   * Fenced blocks (``` or ~~~). The closing fence must repeat the opening one
+   * (\2) at the start of a line; group 3 is the info string after the opener.
+   */
+  sanitized = sanitized.replace(
+    /(^|\n)(```|~~~)([^\n]*)\n[\s\S]*?\n\2(?=\n|$)/g,
+    (_match, leadingBoundary: string, _fence: string, infoString: string) =>
+      `${leadingBoundary}${buildCodeBlockPlaceholder(infoString)}\n`,
+  );
+
+  /* Images: keep the alt text. Must run before the link rule below. */
+  sanitized = sanitized.replace(/!\[([^\]]*)\]\((?:[^()\\]|\\.)+\)/g, "$1");
+  /* Inline links: keep the label, drop the target. */
+  sanitized = sanitized.replace(/\[([^\]]+)\]\((?:[^()\\]|\\.)+\)/g, "$1");
+  /*
+   * Autolinks (<https://…>): drop them whole, before the bare-URL rule, which
+   * would otherwise consume the URL plus ">" and leave a stray "<" behind.
+   */
+  sanitized = sanitized.replace(/<https?:\/\/[^>\s]+>/g, "");
+  /* Bare URLs. */
+  sanitized = sanitized.replace(/https?:\/\/\S+/g, "");
+  /* Inline code: unwrap the backticks, keep the content. */
+  sanitized = sanitized.replace(/`([^`]+)`/g, "$1");
+  /* Line-leading markers: headings, block quotes, bullets, numbered items. */
+  sanitized = sanitized.replace(/^#{1,6}\s+/gm, "");
+  sanitized = sanitized.replace(/^>\s?/gm, "");
+  sanitized = sanitized.replace(/^[-*+]\s+/gm, "");
+  sanitized = sanitized.replace(/^\d+\.\s+/gm, "");
+  /* Horizontal rules, then any remaining emphasis / strikethrough characters. */
+  sanitized = sanitized.replace(/^[-*_]{3,}$/gm, "");
+  sanitized = sanitized.replace(/[*_~]/g, "");
+  sanitized = 
sanitized.replace(/<\/?[^>]+>/g, ""); + sanitized = sanitized.replace(/[ \t]*\|[ \t]*/g, " "); + sanitized = sanitized.replace(/[ \t]+\n/g, "\n"); + sanitized = sanitized.replace(/\n{3,}/g, "\n\n"); + sanitized = sanitized.replace(/[ \t]{2,}/g, " "); + sanitized = sanitized.trim(); + + return hasSpeakableText(sanitized) ? sanitized : ""; +} diff --git a/apps/web/src/features/tts/tts.test.ts b/apps/web/src/features/tts/tts.test.ts new file mode 100644 index 0000000000..357be5360e --- /dev/null +++ b/apps/web/src/features/tts/tts.test.ts @@ -0,0 +1,137 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { getSnapshot, isSupported, stopPlayback, toggleMessagePlayback } from "./tts"; + +class MockSpeechSynthesisUtterance { + readonly text: string; + lang = ""; + voice: SpeechSynthesisVoice | null = null; + onend: (() => void) | null = null; + onerror: ((event: { error?: string }) => void) | null = null; + + constructor(text: string) { + this.text = text; + } +} + +interface SpeechMockState { + readonly speakCalls: MockSpeechSynthesisUtterance[]; + cancelCount: number; +} + +const speechSynthesisDescriptor = Object.getOwnPropertyDescriptor(globalThis, "speechSynthesis"); +const utteranceDescriptor = Object.getOwnPropertyDescriptor(globalThis, "SpeechSynthesisUtterance"); + +function restoreSpeechGlobals(): void { + if (speechSynthesisDescriptor) { + Object.defineProperty(globalThis, "speechSynthesis", speechSynthesisDescriptor); + } else { + Reflect.deleteProperty(globalThis, "speechSynthesis"); + } + + if (utteranceDescriptor) { + Object.defineProperty(globalThis, "SpeechSynthesisUtterance", utteranceDescriptor); + } else { + Reflect.deleteProperty(globalThis, "SpeechSynthesisUtterance"); + } +} + +function installSpeechMock(): SpeechMockState { + const state: SpeechMockState = { + speakCalls: [], + cancelCount: 0, + }; + + Object.defineProperty(globalThis, "speechSynthesis", { + configurable: true, + value: { + cancel: vi.fn(() => { + 
state.cancelCount += 1; + }), + getVoices: vi.fn(() => []), + speak: vi.fn((utterance: SpeechSynthesisUtterance) => { + state.speakCalls.push(utterance as unknown as MockSpeechSynthesisUtterance); + }), + } satisfies Partial, + }); + Object.defineProperty(globalThis, "SpeechSynthesisUtterance", { + configurable: true, + value: MockSpeechSynthesisUtterance as unknown as typeof SpeechSynthesisUtterance, + }); + + return state; +} + +describe("tts", () => { + beforeEach(() => { + restoreSpeechGlobals(); + stopPlayback(); + }); + + afterEach(() => { + restoreSpeechGlobals(); + stopPlayback(); + }); + + it("reports unsupported state when native speech synthesis is unavailable", () => { + expect(isSupported()).toBe(false); + expect(getSnapshot().status).toBe("unsupported"); + }); + + it("starts playback for a message and exposes the active snapshot", () => { + const speech = installSpeechMock(); + + toggleMessagePlayback({ + messageId: "message-1", + text: "Read this response aloud.", + }); + + expect(speech.speakCalls).toHaveLength(1); + expect(speech.speakCalls[0]?.text).toBe("Read this response aloud."); + expect(getSnapshot()).toMatchObject({ + status: "playing", + activeMessageId: "message-1", + provider: "native", + }); + }); + + it("stops playback when toggling the active message again", () => { + const speech = installSpeechMock(); + + toggleMessagePlayback({ + messageId: "message-1", + text: "Read this response aloud.", + }); + toggleMessagePlayback({ + messageId: "message-1", + text: "Read this response aloud.", + }); + + expect(speech.cancelCount).toBe(2); + expect(getSnapshot().status).toBe("idle"); + expect(getSnapshot().activeMessageId).toBeNull(); + }); + + it("switches playback to a different message and ignores stale completion callbacks", () => { + const speech = installSpeechMock(); + + toggleMessagePlayback({ + messageId: "message-1", + text: "First message.", + }); + const firstUtterance = speech.speakCalls[0]!; + + toggleMessagePlayback({ + 
messageId: "message-2", + text: "Second message.", + }); + + firstUtterance.onend?.(); + + expect(speech.cancelCount).toBe(2); + expect(speech.speakCalls).toHaveLength(2); + expect(getSnapshot()).toMatchObject({ + status: "playing", + activeMessageId: "message-2", + }); + }); +}); diff --git a/apps/web/src/features/tts/tts.ts b/apps/web/src/features/tts/tts.ts new file mode 100644 index 0000000000..174dc7e9b1 --- /dev/null +++ b/apps/web/src/features/tts/tts.ts @@ -0,0 +1,147 @@ +import { createNativeSpeechController } from "./nativeSpeechSynthesis"; + +export type TtsProviderKind = "native" | "openai" | "elevenlabs"; +export type TtsPlaybackStatus = "idle" | "playing" | "unsupported" | "error"; + +export interface TtsSnapshot { + readonly status: TtsPlaybackStatus; + readonly activeMessageId: string | null; + readonly provider: TtsProviderKind; + readonly errorMessage?: string; +} + +export interface SpeakMessageInput { + readonly messageId: string; + readonly text: string; + readonly lang?: string; +} + +const listeners = new Set<() => void>(); + +let playbackGeneration = 0; +let snapshot: TtsSnapshot = { + status: "idle", + activeMessageId: null, + provider: "native", +}; + +function emitChange(): void { + for (const listener of listeners) { + listener(); + } +} + +function setSnapshot(nextSnapshot: TtsSnapshot): void { + snapshot = nextSnapshot; + emitChange(); +} + +function buildIdleSnapshot(): TtsSnapshot { + return { + status: "idle", + activeMessageId: null, + provider: "native", + }; +} + +function asError(error: unknown): Error { + if (error instanceof Error) { + return error; + } + + return new Error(typeof error === "string" ? 
error : "Speech synthesis failed.");
+}
+
+/**
+ * Adds `listener` to the set that is notified on every snapshot change.
+ *
+ * @returns A function that removes the listener again.
+ */
+export function subscribe(listener: () => void): () => void {
+  listeners.add(listener);
+  return () => {
+    listeners.delete(listener);
+  };
+}
+
+/** Reports whether the native speech controller considers speech synthesis available. */
+export function isSupported(): boolean {
+  return createNativeSpeechController().isSupported();
+}
+
+/*
+ * Memo for the derived "unsupported" snapshot, keyed by the identity of the
+ * idle snapshot it was built from.
+ */
+let unsupportedSnapshotBase: TtsSnapshot | null = null;
+let unsupportedSnapshot: TtsSnapshot | null = null;
+
+/**
+ * Returns the current playback snapshot.
+ *
+ * While the stored snapshot is idle and speech synthesis is unavailable, the
+ * result is a copy whose status is "unsupported". That copy is cached and
+ * reused until the stored snapshot is replaced: this function is handed to
+ * `useSyncExternalStore`, which requires the same reference for an unchanged
+ * store, and a fresh object literal per call would make it re-render forever.
+ */
+export function getSnapshot(): TtsSnapshot {
+  if (snapshot.status !== "idle" || isSupported()) {
+    return snapshot;
+  }
+
+  if (unsupportedSnapshot === null || unsupportedSnapshotBase !== snapshot) {
+    unsupportedSnapshotBase = snapshot;
+    unsupportedSnapshot = { ...snapshot, status: "unsupported" };
+  }
+
+  return unsupportedSnapshot;
+}
+
+/**
+ * Cancels any native speech and resets the store to idle. The generation
+ * counter is bumped first so callbacks from the cancelled utterance are
+ * recognised as stale.
+ */
+export function stopPlayback(): void {
+  playbackGeneration += 1;
+  createNativeSpeechController().stop();
+  setSnapshot(buildIdleSnapshot());
+}
+
+/**
+ * Starts speaking `input.text` for `input.messageId`, or stops playback when
+ * that message is already the one playing.
+ *
+ * Blank text is ignored. When speech synthesis is unavailable the store is set
+ * to "unsupported". Starting a new message cancels the current one; completion
+ * and error callbacks are ignored once a newer generation has started.
+ */
+export function toggleMessagePlayback(input: SpeakMessageInput): void {
+  const trimmedText = input.text.trim();
+  if (trimmedText.length === 0) {
+    return;
+  }
+
+  const controller = createNativeSpeechController();
+  if (!controller.isSupported()) {
+    setSnapshot({
+      status: "unsupported",
+      activeMessageId: null,
+      provider: "native",
+    });
+    return;
+  }
+
+  /* Second click on the active message acts as "stop". */
+  if (snapshot.status === "playing" && snapshot.activeMessageId === input.messageId) {
+    stopPlayback();
+    return;
+  }
+
+  /* Claim a new generation before cancelling the previous utterance. */
+  const nextGeneration = playbackGeneration + 1;
+  playbackGeneration = nextGeneration;
+  controller.stop();
+  setSnapshot({
+    status: "playing",
+    activeMessageId: input.messageId,
+    provider: "native",
+  });
+
+  try {
+    controller.speak({
+      text: trimmedText,
+      ...(input.lang ? 
{ lang: input.lang } : {}), + onEnd: () => { + if (playbackGeneration !== nextGeneration) { + return; + } + setSnapshot(buildIdleSnapshot()); + }, + onError: (error) => { + if (playbackGeneration !== nextGeneration) { + return; + } + setSnapshot({ + status: "error", + activeMessageId: null, + provider: "native", + errorMessage: error.message, + }); + }, + }); + } catch (error) { + if (playbackGeneration !== nextGeneration) { + return; + } + const resolvedError = asError(error); + setSnapshot({ + status: "error", + activeMessageId: null, + provider: "native", + errorMessage: resolvedError.message, + }); + } +} diff --git a/apps/web/src/features/tts/useMessageTts.ts b/apps/web/src/features/tts/useMessageTts.ts new file mode 100644 index 0000000000..14758648b6 --- /dev/null +++ b/apps/web/src/features/tts/useMessageTts.ts @@ -0,0 +1,28 @@ +import { useSyncExternalStore } from "react"; +import { sanitizeAssistantMessageForTts } from "./sanitizeTtsText"; +import { getSnapshot, isSupported, subscribe, toggleMessagePlayback } from "./tts"; + +export function useMessageTts(messageId: string, text: string) { + const snapshot = useSyncExternalStore(subscribe, getSnapshot, getSnapshot); + const sanitizedText = sanitizeAssistantMessageForTts(text); + const supported = isSupported(); + const isPlaying = snapshot.status === "playing" && snapshot.activeMessageId === messageId; + const canPlay = supported && sanitizedText.length > 0; + + return { + supported, + isPlaying, + canPlay, + title: isPlaying ? "Stop playback" : "Play message", + toggle() { + if (!canPlay) { + return; + } + + toggleMessagePlayback({ + messageId, + text: sanitizedText, + }); + }, + } as const; +}