Add support for text to speech and speech to text (#863)

- Add support for text to speech, speech to text. Add loading and responsive indicators to reflect state. - When streaming for speech to text, show incremental transcription in the message input field - When streaming text to speech, and a pause button in the chat message to allow user to stop playback
2026-03-03 13:19:16 +00:00 · 2024-07-24 02:06:40 -07:00
parent 3e4325edab
commit 694bedc25b
14 changed files with 264 additions and 36 deletions
--- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
+++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
@@ -19,7 +19,10 @@ import {
    Notebook,
    Question,
    Robot,
-    Shapes
+    Shapes,
+    Stop,
+    Waveform,
+    WaveSine
 } from '@phosphor-icons/react';

 import {
@@ -48,6 +51,8 @@ import { PopoverTrigger } from '@radix-ui/react-popover';
 import Link from 'next/link';
 import { AlertDialogCancel } from '@radix-ui/react-alert-dialog';
 import LoginPrompt from '../loginPrompt/loginPrompt';
+import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip';
+import { InlineLoading } from '../loading/loading';

 export interface ChatOptions {
    [key: string]: string
@@ -96,6 +101,9 @@ export default function ChatInputArea(props: ChatInputProps) {
    const [loginRedirectMessage, setLoginRedirectMessage] = useState<string | null>(null);
    const [showLoginPrompt, setShowLoginPrompt] = useState(false);

+    const [recording, setRecording] = useState(false);
+    const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
+
    const [progressValue, setProgressValue] = useState(0);

    useEffect(() => {
@@ -195,6 +203,83 @@ export default function ChatInputArea(props: ChatInputProps) {
        return <ArrowRight className={className} />
    }

+    // Assuming this function is added within the same context as the provided excerpt
+    async function startRecordingAndTranscribe() {
+        try {
+            const microphone = await navigator.mediaDevices.getUserMedia({ audio: true });
+            const mediaRecorder = new MediaRecorder(microphone, { mimeType: 'audio/webm' });
+
+            const audioChunks: Blob[] = [];
+
+            mediaRecorder.ondataavailable = async (event) => {
+                audioChunks.push(event.data);
+                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
+                const formData = new FormData();
+                formData.append('file', audioBlob);
+
+                // Send the incremental audio blob to the server
+                try {
+                    const response = await fetch('/api/transcribe', {
+                        method: 'POST',
+                        body: formData,
+                    });
+
+                    if (!response.ok) {
+                        throw new Error('Network response was not ok');
+                    }
+
+                    const transcription = await response.json();
+                    setMessage(transcription.text.trim());
+                } catch (error) {
+                    console.error('Error sending audio to server:', error);
+                }
+            };
+
+            // Send an audio blob every 1.5 seconds
+            mediaRecorder.start(1500);
+
+            mediaRecorder.onstop = async () => {
+                const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
+                const formData = new FormData();
+                formData.append('file', audioBlob);
+
+                // Send the audio blob to the server
+                try {
+                    const response = await fetch('/api/transcribe', {
+                        method: 'POST',
+                        body: formData,
+                    });
+
+                    if (!response.ok) {
+                        throw new Error('Network response was not ok');
+                    }
+
+                    const transcription = await response.json();
+                    mediaRecorder.stream.getTracks().forEach(track => track.stop());
+                    setMediaRecorder(null);
+                    setMessage(transcription.text.trim());
+                } catch (error) {
+                    console.error('Error sending audio to server:', error);
+                }
+            };
+
+            setMediaRecorder(mediaRecorder);
+        } catch (error) {
+            console.error("Error getting microphone", error);
+        }
+    }
+
+    useEffect(() => {
+        if (!recording && mediaRecorder) {
+            mediaRecorder.stop();
+        }
+
+        if (recording && !mediaRecorder) {
+            startRecordingAndTranscribe();
+        }
+
+    }, [recording]);
+
    return (
        <>
            {
@@ -321,21 +406,58 @@ export default function ChatInputArea(props: ChatInputProps) {
                            }
                        }}
                        onChange={(e) => setMessage(e.target.value)}
-                        disabled={props.sendDisabled} />
+                        disabled={props.sendDisabled || recording} />
                </div>
-                <Button
-                    variant={'ghost'}
-                    className="!bg-none p-1 h-auto text-3xl rounded-full text-gray-300 hover:text-gray-500"
-                    disabled={props.sendDisabled}>
-                    <Microphone weight='fill' className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
-                </Button>
+                {
+                    recording ?
+                        <TooltipProvider>
+                            <Tooltip>
+                                <TooltipTrigger asChild>
+                                    <Button
+                                        variant={'ghost'}
+                                        className="!bg-none p-1 h-auto text-3xl rounded-full text-gray-300 hover:text-gray-500"
+                                        onClick={() => setRecording(!recording)}
+                                        disabled={props.sendDisabled}
+                                    >
+                                        <Stop weight='fill' className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
+                                    </Button>
+                                </TooltipTrigger>
+                                <TooltipContent>
+                                    Click to stop recording and transcribe your voice.
+                                </TooltipContent>
+                            </Tooltip>
+                        </TooltipProvider>
+                        :
+                        (
+                            mediaRecorder ?
+                                <InlineLoading />
+                                :
+                                < TooltipProvider >
+                                    <Tooltip>
+                                        <TooltipTrigger asChild>
+                                            <Button
+                                                variant={'ghost'}
+                                                className="!bg-none p-1 h-auto text-3xl rounded-full text-gray-300 hover:text-gray-500"
+                                                onClick={() => setRecording(!recording)}
+                                                disabled={props.sendDisabled}
+                                            >
+                                                <Microphone weight='fill' className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
+                                            </Button>
+                                        </TooltipTrigger>
+                                        <TooltipContent>
+                                            Click to start recording and transcribe your voice.
+                                        </TooltipContent>
+                                    </Tooltip>
+                                </TooltipProvider>
+                        )
+                }
                <Button
                    className="bg-orange-300 hover:bg-orange-500 rounded-full p-0 h-auto text-3xl transition transform hover:-translate-y-1"
                    onClick={onSendMessage}
                    disabled={props.sendDisabled}>
                    <ArrowCircleUp className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
                </Button>
-            </div>
+            </div >
        </>
    )
 }