From cc77bc4076624cee7084d9c0715ea400d42ebc1c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Tue, 21 Nov 2023 20:37:34 -0800
Subject: [PATCH] Create speech to text API endpoint. Use OpenAI whisper for
 ASR

- Wrap audio transcription in try/finally and delete audio file after
processing
- Use configured speech to text model, else handle error
---
 src/khoj/routers/api.py | 53 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index f2e5c966..0d0d4bb1 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -1,13 +1,16 @@
 # Standard Packages
 import concurrent.futures
 import math
+import os
 import time
 import logging
 import json
 from typing import Annotated, List, Optional, Union, Any
+import uuid
 
 # External Packages
-from fastapi import APIRouter, Depends, HTTPException, Header, Request
+from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
+import openai
 from starlette.authentication import requires
 from asgiref.sync import sync_to_async
 
@@ -553,6 +556,54 @@ async def chat_options(
     return Response(content=json.dumps(cmd_options), media_type="application/json", status_code=200)
 
 
+@api.post("/speak")
+@requires(["authenticated"])
+async def transcribe_audio(request: Request, common: CommonQueryParams, file: UploadFile = File(...)):
+    """Convert the uploaded audio to text with the configured speech to text model. Respond with it as JSON."""
+    user: KhojUser = request.user.object
+    audio_filename = f"{user.uuid}-{str(uuid.uuid4())}.webm"
+    user_message: Optional[str] = None
+    status_code = 500  # Default to server error so the failure response below never reads an unset name

+    # Transcribe the audio from the request
+    try:
+        # Store the audio from the request in a temporary file
+        audio_data = await file.read()
+        with open(audio_filename, "wb") as audio_file_writer:
+            audio_file_writer.write(audio_data)

+        # Send the audio data to the Whisper API
+        speech_to_text_config = await ConversationAdapters.get_speech_to_text_config()
+        openai_chat_config = await ConversationAdapters.get_openai_chat_config()
+        if not openai_chat_config or not speech_to_text_config:
+            # If the user has not configured a speech to text model, return an unprocessable entity error
+            status_code = 422
+        elif speech_to_text_config.model_type == ChatModelOptions.ModelType.OPENAI:
+            # Context manager closes the read handle even if the Whisper API call raises
+            with open(audio_filename, "rb") as audio_file:
+                response = await sync_to_async(openai.Audio.translate)(
+                    model=speech_to_text_config.model_name, file=audio_file, api_key=openai_chat_config.api_key
+                )
+            user_message = response["text"]
+    finally:
+        # Delete the temporary audio file if it was written. Guard avoids masking an earlier error
+        if os.path.exists(audio_filename):
+            os.remove(audio_filename)

+    if user_message is None:
+        return Response(status_code=status_code)

+    update_telemetry_state(
+        request=request,
+        telemetry_type="api",
+        api="speech_to_text",
+        **common.__dict__,
+    )

+    # Return the spoken text
+    return Response(content=json.dumps({"text": user_message}), media_type="application/json", status_code=200)
+
+
 @api.get("/chat", response_class=Response)
 @requires(["authenticated"])
 async def chat(