Add support for text to speech and speech to text (#863)

- Add support for text to speech, speech to text. Add loading and responsive indicators to reflect state.
- When streaming for speech to text, show incremental transcription in the message input field
- When streaming text to speech, add a pause button in the chat message to allow the user to stop playback
This commit is contained in:
sabaimran
2024-07-24 02:06:40 -07:00
committed by GitHub
parent 3e4325edab
commit 694bedc25b
14 changed files with 264 additions and 36 deletions

View File

@@ -19,7 +19,10 @@ import {
Notebook,
Question,
Robot,
Shapes
Shapes,
Stop,
Waveform,
WaveSine
} from '@phosphor-icons/react';
import {
@@ -48,6 +51,8 @@ import { PopoverTrigger } from '@radix-ui/react-popover';
import Link from 'next/link';
import { AlertDialogCancel } from '@radix-ui/react-alert-dialog';
import LoginPrompt from '../loginPrompt/loginPrompt';
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip';
import { InlineLoading } from '../loading/loading';
export interface ChatOptions {
[key: string]: string
@@ -96,6 +101,9 @@ export default function ChatInputArea(props: ChatInputProps) {
const [loginRedirectMessage, setLoginRedirectMessage] = useState<string | null>(null);
const [showLoginPrompt, setShowLoginPrompt] = useState(false);
const [recording, setRecording] = useState(false);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
const [progressValue, setProgressValue] = useState(0);
useEffect(() => {
@@ -195,6 +203,83 @@ export default function ChatInputArea(props: ChatInputProps) {
return <ArrowRight className={className} />
}
// Capture microphone audio and stream it to the transcription API.
// Incremental transcriptions update the message input every 1.5s while
// recording; a final transcription is sent when the recorder stops.
async function startRecordingAndTranscribe() {
    try {
        const microphone = await navigator.mediaDevices.getUserMedia({ audio: true });
        const mediaRecorder = new MediaRecorder(microphone, { mimeType: 'audio/webm' });

        const audioChunks: Blob[] = [];

        // POST all audio captured so far to the server and replace the
        // message input with the transcription. Throws on HTTP failure.
        const transcribe = async () => {
            const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
            const formData = new FormData();
            formData.append('file', audioBlob);
            const response = await fetch('/api/transcribe', {
                method: 'POST',
                body: formData,
            });
            if (!response.ok) {
                throw new Error('Network response was not ok');
            }
            const transcription = await response.json();
            setMessage(transcription.text.trim());
        };

        // Fires every 1.5s (see start(1500) below). Note this re-sends the
        // whole accumulated recording each time, not just the new chunk.
        mediaRecorder.ondataavailable = async (event) => {
            audioChunks.push(event.data);
            try {
                await transcribe();
            } catch (error) {
                console.error('Error sending audio to server:', error);
            }
        };

        mediaRecorder.onstop = async () => {
            try {
                await transcribe();
            } catch (error) {
                console.error('Error sending audio to server:', error);
            } finally {
                // Always release the microphone and clear the recorder, even
                // when the final transcription request fails. The original
                // cleaned up only on success, leaving the mic indicator on.
                mediaRecorder.stream.getTracks().forEach(track => track.stop());
                setMediaRecorder(null);
            }
        };

        // Emit a dataavailable event (and an incremental transcription)
        // every 1.5 seconds.
        mediaRecorder.start(1500);
        setMediaRecorder(mediaRecorder);
    } catch (error) {
        console.error("Error getting microphone", error);
        // NOTE(review): `recording` stays true here when mic access is
        // denied, so the stop button persists — consider resetting it.
    }
}
// Keep the MediaRecorder lifecycle in sync with the `recording` flag:
// start capturing when recording begins, stop the recorder when it ends.
useEffect(() => {
    if (recording) {
        if (!mediaRecorder) startRecordingAndTranscribe();
    } else if (mediaRecorder) {
        mediaRecorder.stop();
    }
}, [recording]);
return (
<>
{
@@ -321,21 +406,58 @@ export default function ChatInputArea(props: ChatInputProps) {
}
}}
onChange={(e) => setMessage(e.target.value)}
disabled={props.sendDisabled} />
disabled={props.sendDisabled || recording} />
</div>
<Button
variant={'ghost'}
className="!bg-none p-1 h-auto text-3xl rounded-full text-gray-300 hover:text-gray-500"
disabled={props.sendDisabled}>
<Microphone weight='fill' className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
</Button>
{
recording ?
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<Button
variant={'ghost'}
className="!bg-none p-1 h-auto text-3xl rounded-full text-gray-300 hover:text-gray-500"
onClick={() => setRecording(!recording)}
disabled={props.sendDisabled}
>
<Stop weight='fill' className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
</Button>
</TooltipTrigger>
<TooltipContent>
Click to stop recording and transcribe your voice.
</TooltipContent>
</Tooltip>
</TooltipProvider>
:
(
mediaRecorder ?
<InlineLoading />
:
< TooltipProvider >
<Tooltip>
<TooltipTrigger asChild>
<Button
variant={'ghost'}
className="!bg-none p-1 h-auto text-3xl rounded-full text-gray-300 hover:text-gray-500"
onClick={() => setRecording(!recording)}
disabled={props.sendDisabled}
>
<Microphone weight='fill' className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
</Button>
</TooltipTrigger>
<TooltipContent>
Click to start recording and transcribe your voice.
</TooltipContent>
</Tooltip>
</TooltipProvider>
)
}
<Button
className="bg-orange-300 hover:bg-orange-500 rounded-full p-0 h-auto text-3xl transition transform hover:-translate-y-1"
onClick={onSendMessage}
disabled={props.sendDisabled}>
<ArrowCircleUp className={`${props.isMobileWidth ? 'w-6 h-6' : 'w-8 h-8'}`} />
</Button>
</div>
</div >
</>
)
}

View File

@@ -10,9 +10,10 @@ import 'katex/dist/katex.min.css';
import { TeaserReferencesSection, constructAllReferences } from '../referencePanel/referencePanel';
import { ThumbsUp, ThumbsDown, Copy, Brain, Cloud, Folder, Book, Aperture, SpeakerHigh, MagnifyingGlass } from '@phosphor-icons/react';
import { ThumbsUp, ThumbsDown, Copy, Brain, Cloud, Folder, Book, Aperture, SpeakerHigh, MagnifyingGlass, Pause } from '@phosphor-icons/react';
import * as DomPurify from 'dompurify';
import { InlineLoading } from '../loading/loading';
const md = new markdownIt({
html: true,
@@ -206,8 +207,16 @@ export default function ChatMessage(props: ChatMessageProps) {
const [copySuccess, setCopySuccess] = useState<boolean>(false);
const [isHovering, setIsHovering] = useState<boolean>(false);
const [markdownRendered, setMarkdownRendered] = useState<string>('');
const [isPlaying, setIsPlaying] = useState<boolean>(false);
const [interrupted, setInterrupted] = useState<boolean>(false);
const interruptedRef = useRef<boolean>(false);
const messageRef = useRef<HTMLDivElement>(null);
useEffect(() => {
interruptedRef.current = interrupted;
}, [interrupted]);
useEffect(() => {
let message = props.chatMessage.message;
@@ -278,8 +287,8 @@ export default function ChatMessage(props: ChatMessageProps) {
// Format a UTC timestamp (sent by the server without a zone suffix) as
// e.g. "02:06 AM on Jul 24, 2024" in the viewer's local time zone.
// Note: this span in the diff view contained both the old en-IN lines and
// the new en-US lines; only the post-change en-US version is kept here.
function formatDate(timestamp: string) {
    // Append "Z" so Date parses the naive timestamp as UTC, not local time.
    const date = new Date(timestamp + "Z");
    const time_string = date.toLocaleTimeString('en-US', { hour: '2-digit', minute: '2-digit', hour12: true }).toUpperCase();
    const date_string = date.toLocaleString('en-US', { year: 'numeric', month: 'short', day: '2-digit' }).replaceAll('-', ' ');
    return `${time_string} on ${date_string}`;
}
@@ -330,6 +339,79 @@ export default function ChatMessage(props: ChatMessageProps) {
return classes.join(' ');
}
// Play the chat message aloud via the Khoj speech API. The message is
// split into sentence-sized chunks; the next chunk is prefetched while
// the current one plays so playback is near-gapless. Playback stops when
// the user interrupts (pause button) or a fetch/playback error occurs.
async function playTextToSpeech() {
    // Browser native speech API
    // const utterance = new SpeechSynthesisUtterance(props.chatMessage.message);
    // speechSynthesis.speak(utterance);

    // Break the message up into chunks of sentences.
    const sentenceRegex = /[^.!?]+[.!?]*/g;
    const chunks = props.chatMessage.message.match(sentenceRegex) || [];

    // Nothing to speak (collapses the original's three separate guards).
    if (chunks.length === 0 || !chunks[0]) {
        return;
    }

    setIsPlaying(true);

    // Pipeline: kick off the fetch for chunk i+1 while chunk i plays.
    let nextBlobPromise = fetchBlob(chunks[0]);
    for (let i = 0; i < chunks.length; i++) {
        if (interruptedRef.current) {
            break; // Exit the loop if interrupted
        }

        const currentBlobPromise = nextBlobPromise;
        if (i < chunks.length - 1) {
            nextBlobPromise = fetchBlob(chunks[i + 1]);
        }

        let url: string | null = null;
        try {
            const blob = await currentBlobPromise;
            url = URL.createObjectURL(blob);
            await playAudio(url);
        } catch (error) {
            console.error('Error:', error);
            break; // Exit the loop on error
        } finally {
            // Release the object URL once this chunk is done; the original
            // never revoked them, leaking one blob URL per sentence.
            if (url) URL.revokeObjectURL(url);
        }
    }

    setIsPlaying(false);
    setInterrupted(false); // Reset interrupted state after playback
}
// Request synthesized speech for `text` from the Khoj speech API and
// return the audio as a Blob. Throws on a non-2xx response.
async function fetchBlob(text: string) {
    const speechUrl = `/api/chat/speech?text=${encodeURIComponent(text)}`;
    const response = await fetch(speechUrl, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
    });
    if (!response.ok) throw new Error('Network response was not ok');
    return response.blob();
}
// Play the audio at `url`, resolving when playback finishes.
// Rejects on a media error or when playback cannot start at all.
function playAudio(url: string) {
    return new Promise((resolve, reject) => {
        const audio = new Audio(url);
        audio.onended = resolve;
        audio.onerror = reject;
        // play() returns a promise; if it rejects (e.g. the browser's
        // autoplay policy blocks playback) `onerror` never fires, and the
        // original left this promise pending forever, hanging the caller.
        audio.play().catch(reject);
    });
}
const allReferences = constructAllReferences(props.chatMessage.context, props.chatMessage.onlineContext);
return (
@@ -349,7 +431,7 @@ export default function ChatMessage(props: ChatMessageProps) {
</div>
<div className={styles.chatFooter}>
{
(isHovering || props.isMobileWidth || props.isLastMessage) &&
(isHovering || props.isMobileWidth || props.isLastMessage || isPlaying) &&
(
<>
<div title={formatDate(props.chatMessage.created)} className={`text-gray-400 relative top-0 left-4`}>
@@ -359,9 +441,17 @@ export default function ChatMessage(props: ChatMessageProps) {
{
(props.chatMessage.by === "khoj") &&
(
<button title="Speak" onClick={(event) => console.log("speaker")}>
<SpeakerHigh alt="Speak Message" color='hsl(var(--muted-foreground))' />
</button>
isPlaying ?
(
interrupted ?
<InlineLoading iconClassName='p-0' className='m-0' />
: <button title="Pause Speech" onClick={(event) => setInterrupted(true)}>
<Pause alt="Pause Message" color='hsl(var(--muted-foreground))' />
</button>
)
: <button title="Speak" onClick={(event) => playTextToSpeech()}>
<SpeakerHigh alt="Speak Message" color='hsl(var(--muted-foreground))' />
</button>
)
}
<button title="Copy" className={`${styles.copyButton}`} onClick={() => {

View File

@@ -2,6 +2,7 @@ import { CircleNotch } from '@phosphor-icons/react';
// Props shared by the Loading and InlineLoading indicator components.
interface LoadingProps {
// Extra CSS classes applied to the wrapping element.
className?: string;
// CSS classes for the spinner icon; InlineLoading falls back to
// 'h-5 w-5 mx-3' when this is not set.
iconClassName?: string;
// Optional text rendered alongside the spinner.
message?: string;
}
@@ -17,7 +18,7 @@ export default function Loading(props: LoadingProps) {
// Compact spinner with an optional message, for inline use (e.g. in chat
// controls while transcribing or fetching speech). This span in the diff
// view contained both the old and new <span> lines; only the post-change
// version (with the configurable iconClassName) is kept here.
export function InlineLoading(props: LoadingProps) {
    return (
        <button className={`${props.className}`}>
            <span>{props.message} <CircleNotch className={`inline animate-spin ${props.iconClassName ? props.iconClassName : 'h-5 w-5 mx-3'}`}/></span>
        </button>
    )
}

View File

@@ -9,6 +9,7 @@ import {
import styles from "./suggestions.module.css";
import { getIconFromIconName } from "@/app/common/iconUtils";