diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java new file mode 100644 index 00000000..d5483a44 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java @@ -0,0 +1,12 @@ +package ai.javaclaw.speech; + +public class SpeechToTextException extends RuntimeException { + + public SpeechToTextException(String message) { + super(message); + } + + public SpeechToTextException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java new file mode 100644 index 00000000..ac9f8606 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java @@ -0,0 +1,8 @@ +package ai.javaclaw.speech; + +import java.io.InputStream; + +public interface SpeechToTextService { + + String transcribe(InputStream audioStream); +} diff --git a/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java new file mode 100644 index 00000000..4b795dc6 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java @@ -0,0 +1,121 @@ +package ai.javaclaw.speech; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.concurrent.TimeUnit; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp") +public class WhisperCppSpeechToTextService implements SpeechToTextService { + + private static final Logger LOGGER = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class); + + private final String modelPath; + + public WhisperCppSpeechToTextService( + @Value("${speech.whisper-cpp.model-path}") String modelPath) { + this.modelPath = modelPath; + } + + @Override + public String transcribe(InputStream audioStream) { + LOGGER.info("Transcribing audio via whisper-cpp (model: {})", modelPath); + + Path oggFile = null; + Path wavFile = null; + Path outputFile = null; + + try { + oggFile = Files.createTempFile("whisper-input-", ".ogg"); + Files.write(oggFile, audioStream.readAllBytes()); + + wavFile = Files.createTempFile("whisper-input-", ".wav"); + convertOggToWav(oggFile, wavFile); + + outputFile = Files.createTempFile("whisper-output-", ".txt"); + Files.deleteIfExists(outputFile); + + ProcessBuilder pb = new ProcessBuilder( + "whisper-cli", + "-m", modelPath, + "-f", wavFile.toString(), + "-otxt", + "-of", outputFile.toString().replace(".txt", ""), + "--no-prints" + ); + pb.redirectErrorStream(true); + + Process process = pb.start(); + boolean finished = process.waitFor(60, TimeUnit.SECONDS); + + if (!finished) { + process.destroyForcibly(); + throw new SpeechToTextException("whisper-cli timed out after 60 seconds"); + } + + if (process.exitValue() != 0) { + String error = new String(process.getInputStream().readAllBytes()); + throw new SpeechToTextException("whisper-cli exited with code " + process.exitValue() + ": " + error); + } + + if (!Files.exists(outputFile)) { + throw new SpeechToTextException("whisper-cli did not produce output file"); + } + + String text = Files.readString(outputFile).trim(); + if (text.isBlank()) { + throw new SpeechToTextException("whisper-cli returned empty transcription"); + } + + LOGGER.info("whisper-cpp transcription completed successfully"); + return text; + + } catch (IOException | InterruptedException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new SpeechToTextException("Failed to run whisper-cli", e); + } finally { + deleteSilently(oggFile); + deleteSilently(wavFile); + deleteSilently(outputFile); + } + } + + private void convertOggToWav(Path oggFile, Path wavFile) throws IOException, InterruptedException { + ProcessBuilder pb = new ProcessBuilder( + "ffmpeg", "-y", "-i", oggFile.toString(), "-ar", "16000", "-ac", "1", wavFile.toString() + ); + pb.redirectErrorStream(true); + + Process process = pb.start(); + boolean finished = process.waitFor(30, TimeUnit.SECONDS); + + if (!finished) { + process.destroyForcibly(); + throw new SpeechToTextException("ffmpeg conversion timed out"); + } + + if (process.exitValue() != 0) { + String error = new String(process.getInputStream().readAllBytes()); + throw new SpeechToTextException("ffmpeg conversion failed: " + error); + } + } + + private void deleteSilently(Path path) { + if (path != null) { + try { + Files.deleteIfExists(path); + } catch (IOException ignored) { + } + } + } +} diff --git a/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java new file mode 100644 index 00000000..f4dfcd02 --- /dev/null +++ b/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java @@ -0,0 +1,11 @@ +package ai.javaclaw.speech; + +import java.io.InputStream; + +public class MockSpeechToTextService implements SpeechToTextService { + + @Override + public String transcribe(InputStream audioStream) { + return "[voice message]"; + } +} diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java index 018c427f..05798422 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java @@ -4,7 +4,11 @@ import ai.javaclaw.channels.Channel; import ai.javaclaw.channels.ChannelMessageReceivedEvent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient; @@ -17,11 +21,11 @@ import org.telegram.telegrambots.meta.api.objects.message.Message; import org.telegram.telegrambots.meta.exceptions.TelegramApiException; import org.telegram.telegrambots.meta.generics.TelegramClient; -import org.commonmark.node.Node; -import org.commonmark.parser.Parser; -import org.commonmark.renderer.html.HtmlRenderer; +import java.io.IOException; +import java.io.InputStream; import java.util.List; +import java.util.Optional; import static java.util.Optional.ofNullable; @@ -40,18 +44,26 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli private final TelegramClient telegramClient; private final Agent agent; private final ChannelRegistry channelRegistry; + private final SpeechToTextService speechToTextService; + private final TelegramVoiceDownloader voiceDownloader; private Long chatId; - public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) { - this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry); + public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService); + } + + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken)); } - TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) { + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) { this.botToken = botToken; this.allowedUsername = normalizeUsername(allowedUsername); this.telegramClient = telegramClient; this.agent = agent; this.channelRegistry = channelRegistry; + this.speechToTextService = speechToTextService; + this.voiceDownloader = voiceDownloader; channelRegistry.registerChannel(this); LOGGER.info("Started Telegram integration"); } @@ -68,7 +80,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() { @Override public void consume(Update update) { - if (!(update.hasMessage() && update.getMessage().hasText())) return; + if (!update.hasMessage()) return; Message requestMessage = update.getMessage(); String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName(); @@ -78,11 +90,13 @@ public void consume(Update update) { return; } - String messageText = requestMessage.getText(); + Optional messageText = resolveMessageText(requestMessage); + if (messageText.isEmpty()) return; + this.chatId = requestMessage.getChatId(); Integer messageThreadId = requestMessage.getMessageThreadId(); - channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId)); - String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText); + channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId)); + String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get()); sendMessage(chatId, messageThreadId, response); } @@ -124,6 +138,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) { } } + private Optional resolveMessageText(Message message) { + if (message.hasText()) { + return Optional.of(message.getText()); + } + if (message.hasVoice()) { + return transcribeVoice(message); + } + return Optional.empty(); + } + + private Optional transcribeVoice(Message message) { + LOGGER.info("Voice message received, downloading audio"); + try (InputStream voiceStream = voiceDownloader.download(message)) { + String transcribed = speechToTextService.transcribe(voiceStream); + LOGGER.info("Voice message transcribed successfully"); + return Optional.of(transcribed); + } catch (IOException | TelegramApiException e) { + LOGGER.error("Failed to process voice message", e); + return Optional.empty(); + } + } + private String convertMarkdownToTelegramHtml(String markdown) { if (markdown == null || markdown.isBlank()) return ""; @@ -181,4 +217,4 @@ public Integer getMessageThreadId() { return messageThreadId; } } -} \ No newline at end of file +} diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java index 76bb8c49..a007c610 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java @@ -3,6 +3,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.AutoConfiguration; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; @@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration { public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken, @Value("${agent.channels.telegram.username:null}") String allowedUsername, Agent agent, - ChannelRegistry channelRegistry) { - return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry); + ChannelRegistry channelRegistry, + SpeechToTextService speechToTextService) { + return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService); } } diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java new file mode 100644 index 00000000..5ab8a040 --- /dev/null +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java @@ -0,0 +1,29 @@ +package ai.javaclaw.channels.telegram; + +import org.telegram.telegrambots.meta.api.methods.GetFile; +import org.telegram.telegrambots.meta.api.objects.message.Message; +import org.telegram.telegrambots.meta.exceptions.TelegramApiException; +import org.telegram.telegrambots.meta.generics.TelegramClient; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; + +class TelegramVoiceDownloader { + + private final TelegramClient telegramClient; + private final String botToken; + + TelegramVoiceDownloader(TelegramClient telegramClient, String botToken) { + this.telegramClient = telegramClient; + this.botToken = botToken; + } + + InputStream download(Message message) throws TelegramApiException, IOException { + String fileId = message.getVoice().getFileId(); + GetFile getFile = new GetFile(fileId); + String filePath = telegramClient.execute(getFile).getFilePath(); + String fileUrl = "https://api.telegram.org/file/bot" + botToken + "/" + filePath; + return URI.create(fileUrl).toURL().openStream(); + } +} diff --git a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java index a8d0378b..d688eeec 100644 --- a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java +++ b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java @@ -2,6 +2,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -32,6 +33,9 @@ class TelegramChannelTest { @Mock private Agent agent; + @Mock + private SpeechToTextService speechToTextService; + // ----------------------------------------------------------------------- // Ignored updates // ----------------------------------------------------------------------- @@ -48,13 +52,17 @@ void ignoresUpdatesWithoutMessage() { } @Test - void ignoresUpdatesWithoutText() { + void ignoresUpdatesWithoutTextOrVoice() { TelegramChannel channel = channel("allowed_user"); Update update = mock(Update.class); Message message = mock(Message.class); + User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); + when(message.getFrom()).thenReturn(user); + when(user.getUserName()).thenReturn("allowed_user"); when(message.hasText()).thenReturn(false); + when(message.hasVoice()).thenReturn(false); channel.consume(update); @@ -68,7 +76,6 @@ void ignoresMessagesFromNullUsername() { Message message = mock(Message.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(null); channel.consume(update); @@ -234,7 +241,7 @@ void sendMessageFallbacksToSendingRawTextWhenFailingToSendHtml() throws Telegram // ----------------------------------------------------------------------- private TelegramChannel channel(String allowedUsername) { - return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry()); + return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService); } private Update updateFromUnknownUser(String username) { @@ -243,7 +250,6 @@ private Update updateFromUnknownUser(String username) { User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(user); when(user.getUserName()).thenReturn(username); return update; diff --git a/providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java b/providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java new file mode 100644 index 00000000..2d354b8d --- /dev/null +++ b/providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java @@ -0,0 +1,44 @@ +package ai.javaclaw.providers.openai; + +import ai.javaclaw.speech.SpeechToTextException; +import ai.javaclaw.speech.SpeechToTextService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.ai.audio.transcription.AudioTranscriptionPrompt; +import org.springframework.ai.audio.transcription.AudioTranscriptionResponse; +import org.springframework.ai.openai.OpenAiAudioTranscriptionModel; +import org.springframework.boot.autoconfigure.condition.ConditionalOnBean; +import org.springframework.core.io.InputStreamResource; +import org.springframework.stereotype.Service; + +import java.io.InputStream; + +@Service +@ConditionalOnBean(OpenAiAudioTranscriptionModel.class) +public class OpenAiSpeechToTextService implements SpeechToTextService { + + private static final Logger LOGGER = LoggerFactory.getLogger(OpenAiSpeechToTextService.class); + + private final OpenAiAudioTranscriptionModel transcriptionModel; + + public OpenAiSpeechToTextService(OpenAiAudioTranscriptionModel transcriptionModel) { + this.transcriptionModel = transcriptionModel; + } + + @Override + public String transcribe(InputStream audioStream) { + LOGGER.info("Transcribing audio via Spring AI OpenAI transcription"); + + AudioTranscriptionResponse response = transcriptionModel.call( + new AudioTranscriptionPrompt(new InputStreamResource(audioStream)) + ); + + String text = response.getResult().getOutput(); + if (text == null || text.isBlank()) { + throw new SpeechToTextException("OpenAI returned empty transcription"); + } + + LOGGER.info("OpenAI transcription completed successfully"); + return text.trim(); + } +}