Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 104 additions & 1 deletion app/src/features/human/useHumanMascot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

import type { ChatEventListeners } from '../../services/chatService';
import { VISEMES } from './Mascot/visemes';
import { ACK_FACE_HOLD_MS, pickViseme, useHumanMascot } from './useHumanMascot';
import {
ACK_FACE_HOLD_MS,
pickConversationAckFace,
pickViseme,
useHumanMascot,
} from './useHumanMascot';
import { type PlaybackHandle, playBase64Audio } from './voice/audioPlayer';
import { synthesizeSpeech } from './voice/ttsClient';

Expand Down Expand Up @@ -133,6 +138,46 @@ describe('pickViseme', () => {
});
});

describe('pickConversationAckFace', () => {
  // Builds a chat_done-shaped payload that carries reply text but no emoji.
  const textOnly = (full_response: string) => ({ full_response, reaction_emoji: null });

  it('prefers explicit reaction emoji from chat_done', () => {
    // Each row: [reaction emoji sent with the turn, face the mascot should show].
    const expectations = [
      ['✅', 'happy'],
      ['🤔', 'confused'],
      ['\u26A0\uFE0F', 'concerned'], // warning sign + emoji variation selector
    ] as const;

    for (const [reaction_emoji, face] of expectations) {
      expect(pickConversationAckFace({ full_response: 'Done', reaction_emoji })).toBe(face);
    }
  });

  it('falls back to deterministic response text cues', () => {
    // Each row: [reply text with no emoji attached, face inferred from the wording].
    const expectations = [
      ['All set, this is fixed.', 'happy'],
      ['I need more detail to clarify which workspace you mean.', 'confused'],
      ['Sorry, the provider failed and I cannot continue.', 'concerned'],
    ] as const;

    for (const [reply, face] of expectations) {
      expect(pickConversationAckFace(textOnly(reply))).toBe(face);
    }
  });

  it('returns null when there is no strong cue', () => {
    const picked = pickConversationAckFace(textOnly('Here is the summary.'));
    expect(picked).toBeNull();
  });

  it('returns null when the response text is missing', () => {
    const picked = pickConversationAckFace({ reaction_emoji: null });
    expect(picked).toBeNull();
  });
});

describe('useHumanMascot state machine', () => {
beforeEach(() => {
capturedListeners = null;
Expand Down Expand Up @@ -226,6 +271,42 @@ describe('useHumanMascot state machine', () => {
expect(result.current.face).toBe('idle');
});

it('uses reaction emoji for the post-turn acknowledgement face', () => {
  // chat_done payload whose explicit reaction emoji should drive the face.
  const donePayload = {
    full_response: 'I need more detail before I can choose.',
    reaction_emoji: '🤔',
    rounds_used: 1,
    total_input_tokens: 1,
    total_output_tokens: 1,
  };
  const { result: mascot } = renderHook(() => useHumanMascot({ speakReplies: false }));

  act(() => {
    capturedListeners?.onDone?.(fakeEvent(donePayload));
  });
  expect(mascot.current.face).toBe('confused');

  // Once the hold window has elapsed the mascot settles back to idle.
  act(() => {
    vi.advanceTimersByTime(ACK_FACE_HOLD_MS + 1);
  });
  expect(mascot.current.face).toBe('idle');
});

it('uses response text cues when no reaction emoji is present', () => {
  // No emoji on the event, so the face has to come from the reply wording.
  const donePayload = {
    full_response: 'Sorry, that failed because the provider is unavailable.',
    reaction_emoji: null,
    rounds_used: 1,
    total_input_tokens: 1,
    total_output_tokens: 1,
  };
  const { result: mascot } = renderHook(() => useHumanMascot({ speakReplies: false }));

  act(() => {
    capturedListeners?.onDone?.(fakeEvent(donePayload));
  });

  expect(mascot.current.face).toBe('concerned');
});

it('holds concerned briefly on chat_error, then idles', () => {
const { result } = renderHook(() => useHumanMascot());
act(() => {
Expand Down Expand Up @@ -518,6 +599,28 @@ describe('useHumanMascot TTS playback', () => {
expect(result.current.face).toBe('idle');
});

it('shows concerned when audio playback cannot start', async () => {
  const synthesizeMock = synthesizeSpeech as ReturnType<typeof vi.fn>;
  const playMock = playBase64Audio as ReturnType<typeof vi.fn>;

  // Synthesis succeeds, but handing the audio to the player rejects.
  synthesizeMock.mockResolvedValueOnce({
    audio_base64: 'AAA=',
    audio_mime: 'audio/mpeg',
    visemes: [{ viseme: 'aa', start_ms: 0, end_ms: 100 }],
  });
  playMock.mockRejectedValueOnce(new Error('decode failed'));

  const { result: mascot } = renderHook(() => useHumanMascot({ speakReplies: true }));
  await act(async () => {
    capturedListeners?.onDone?.(fakeDone('All set, this is fixed.'));
    // Yield three microtask turns so the mocked promises can settle.
    for (let turn = 0; turn < 3; turn += 1) {
      await Promise.resolve();
    }
  });
  expect(mascot.current.face).toBe('concerned');

  act(() => {
    vi.advanceTimersByTime(ACK_FACE_HOLD_MS + 1);
  });
  expect(mascot.current.face).toBe('idle');
});

// Issue #1762 — the user-selected mascot voice id flows through to
// every TTS RPC the hook makes. The store-stub at module scope lets
// these specs pin the prop without standing up a Redux Provider.
Expand Down
53 changes: 47 additions & 6 deletions app/src/features/human/useHumanMascot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,40 @@ export function pickViseme(delta: string): VisemeShape {
}
}

// The three mascot faces that pickConversationAckFace can return.
type ConversationAckFace = Extract<MascotFace, 'happy' | 'confused' | 'concerned'>;
// The two event fields pickConversationAckFace reads; either may be absent or null.
type ConversationAckEvent = { full_response?: string | null; reaction_emoji?: string | null };

// Reaction emoji that pickConversationAckFace maps to the 'happy' face.
const HAPPY_REACTION_EMOJIS = new Set(['✅', '🎉', '🙌', '😊', '😄', '👍', '💪']);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor] HAPPY_REACTION_EMOJIS contains 👍 but won't match skin-tone variants like 👍🏽 (they're distinct Unicode sequences). Same for other emoji sets. Probably fine if the backend sends base codepoints, but worth a note if you later see mismatches.

// Reaction emoji that map to the 'confused' acknowledgement face.
const CONFUSED_REACTION_EMOJIS = new Set(['🤔', '❓', '❔']);
// Reaction emoji that map to the 'concerned' acknowledgement face.
// The warning sign is listed twice on purpose: once with the emoji variation
// selector (U+26A0 U+FE0F) and once bare (U+26A0). The two render the same but
// are different strings, so they are written as escapes to keep that visible.
const CONCERNED_REACTION_EMOJIS = new Set(['\u26A0\uFE0F', '\u26A0', '🚨', '❌', '😕', '😟']);

// Whole-word, case-insensitive cues that a reply reports an apology, failure,
// or blocker. "can't" is accepted with either a straight apostrophe or a
// typographic one (U+2019), since generated replies often use the latter.
const CONCERNED_TEXT_RE =
  /\b(sorry|apolog(?:y|ize|ise)|failed|failure|error|cannot|can['\u2019]t|unable|blocked|problem)\b/i;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor] HAPPY_TEXT_RE — common words like `done`, `ready`, and `nice` can false-positive on neutral or negative messages. For example, "I'm done explaining — the system is broken" matches `done` → `happy`. The concerned > confused > happy priority order already mitigates the worst cases, but consider tightening these to phrase-level anchors (e.g. "all set", "it's done", "that's fixed") in a follow-up if you see odd expressions in practice.

// Whole-word, case-insensitive cues that a reply is uncertain or asking for
// clarification; pickConversationAckFace maps a match to 'confused'.
const CONFUSED_TEXT_RE =
/\b(not sure|unclear|ambiguous|clarify|which one|need more|can you confirm|maybe)\b/i;
// Whole-word, case-insensitive cues that a reply reports success;
// pickConversationAckFace checks this last, after the concerned and confused patterns.
const HAPPY_TEXT_RE = /\b(done|completed|fixed|success|successful|ready|all set|great|nice)\b/i;

// Code points that change how an emoji is presented but not what it means:
// the emoji variation selector (U+FE0F) and the Fitzpatrick skin-tone
// modifiers (U+1F3FB–U+1F3FF).
const REACTION_EMOJI_MODIFIER_RE = /[\uFE0F\u{1F3FB}-\u{1F3FF}]/gu;

/**
 * Map conversation-level meaning into the short acknowledgement face that
 * follows a completed turn. Runtime activity still owns thinking/speaking
 * states; this only decides the post-turn emotional beat.
 *
 * Resolution order:
 * 1. An explicit `reaction_emoji`, looked up in the happy, confused, then
 *    concerned emoji sets. The emoji is tried verbatim first and then with
 *    variation selectors and skin-tone modifiers removed, so a variant such
 *    as a skin-toned thumbs-up resolves like its base emoji.
 * 2. Text cues in `full_response`, checked concerned, then confused, then
 *    happy, so a negative cue wins over a positive word in the same reply.
 *
 * @param event - Event fields to inspect; both may be absent or null.
 * @returns The face to hold, or `null` when neither the emoji nor the text
 *   gives a recognised cue (including when the response text is empty).
 */
export function pickConversationAckFace(event: ConversationAckEvent): ConversationAckFace | null {
  const reaction = event.reaction_emoji?.trim();
  if (reaction) {
    const baseReaction = reaction.replace(REACTION_EMOJI_MODIFIER_RE, '');
    // Verbatim form goes first so every previously recognised emoji resolves
    // exactly as before; the stripped form only adds new matches.
    const candidates = baseReaction === reaction ? [reaction] : [reaction, baseReaction];
    for (const candidate of candidates) {
      if (HAPPY_REACTION_EMOJIS.has(candidate)) return 'happy';
      if (CONFUSED_REACTION_EMOJIS.has(candidate)) return 'confused';
      if (CONCERNED_REACTION_EMOJIS.has(candidate)) return 'concerned';
    }
  }

  // An unrecognised (or missing) emoji falls through to the reply wording.
  const text = event.full_response?.trim() ?? '';
  if (!text) return null;
  if (CONCERNED_TEXT_RE.test(text)) return 'concerned';
  if (CONFUSED_TEXT_RE.test(text)) return 'confused';
  if (HAPPY_TEXT_RE.test(text)) return 'happy';
  return null;
}

export interface UseHumanMascotOptions {
/** When true, post-stream replies are sent to ElevenLabs and the mouth
* follows the returned viseme timeline while the audio plays. */
Expand All @@ -99,9 +133,9 @@ export interface UseHumanMascotResult {
* - `iteration_start` round > 1 or `tool_call` → `confused` (heavy reasoning)
* - `tool_result success=false` → `concerned` (held briefly)
* - `text_delta` → `speaking`, pseudo-lipsync from the trailing letter
* - `chat_done` (no TTS) → `happy` (held briefly), then `idle`
* - `chat_done` (no TTS) → message-aware ack face (held briefly), then `idle`
* - `chat_done` (TTS enabled) → `thinking` while synthesizing → `speaking`
* with real visemes → `idle` when the audio ends
* with real visemes → message-aware ack face when the audio ends
* - `chat_error`, TTS failure → `concerned` (held briefly), then `idle`
* - `listening` option override → `listening` (highest priority)
*
Expand Down Expand Up @@ -187,13 +221,14 @@ export function useHumanMascot(options: UseHumanMascotOptions = {}): UseHumanMas
lastDeltaAtRef.current = window.performance.now();
},
onDone: e => {
const ackFace = pickConversationAckFace(e) ?? 'happy';
if (!speakRef.current || !e.full_response?.trim()) {
// Soft acknowledgement beat instead of snapping back to idle.
holdThenIdle('happy');
holdThenIdle(ackFace);
return;
}
// Fire-and-forget — startTtsPlayback owns its cleanup via finally.
void startTtsPlayback(e.full_response).catch(() => {});
void startTtsPlayback(e.full_response, ackFace).catch(() => {});
},
onError: () => {
// Bump seq to invalidate any in-flight startTtsPlayback awaiters.
Expand Down Expand Up @@ -225,7 +260,10 @@ export function useHumanMascot(options: UseHumanMascotOptions = {}): UseHumanMas
};
}, []);

async function startTtsPlayback(text: string): Promise<void> {
async function startTtsPlayback(
text: string,
ackFace: ConversationAckFace = 'happy'
): Promise<void> {
Comment thread
coderabbitai[bot] marked this conversation as resolved.
// Cancel any in-flight playback so its handle.ended callback can't reset
// state belonging to the new run.
const prev = playbackRef.current;
Expand Down Expand Up @@ -313,14 +351,17 @@ export function useHumanMascot(options: UseHumanMascotOptions = {}): UseHumanMas
// rethrow anything else so real decoder errors aren't masked.
swallowAudioStop(err);
}
} catch (err) {
if (isStillCurrent()) degraded = true;
throw err;
} finally {
if (isStillCurrent()) {
playbackRef.current = null;
visemeFramesRef.current = [];
if (degraded) {
holdThenIdle('concerned');
} else {
holdThenIdle('happy');
holdThenIdle(ackFace);
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions gitbooks/features/mascot/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ This is the headline use case and has its own page, see [Meeting Agents](meeting

The mascot has mood states (idle, thinking, listening, talking, surprised, dreaming) and it transitions between them based on what the agent is doing. When you start typing it shifts into a listening pose. When the model is reasoning, it shows that. When a tool call returns something noteworthy, it reacts. When you stop interacting for a while, it drifts into idle.

After a turn finishes, the desktop mascot also reads the conversation-level cue that arrives with the chat result. A success cue produces a short happy acknowledgement, uncertainty produces a confused acknowledgement, and warnings or failed outcomes produce a concerned acknowledgement. If no strong cue is present, it keeps the existing calm post-turn acknowledgement and falls back to idle.

It is meant to feel alive, not animated-on-rails.

### It remembers you
Expand Down
Loading