From fd15fc1e598b3dc7f720e86cb336c3ae1199b66f Mon Sep 17 00:00:00 2001 From: Debanjum Date: Mon, 11 Nov 2024 03:23:58 -0800 Subject: [PATCH 1/3] Move construct chat history back to its original position in file Keeping function where it originally was allows tracking diffs and change history more easily --- src/khoj/processor/conversation/utils.py | 58 ++++++++++++------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index b46e60ed..7fe83d06 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -140,6 +140,35 @@ def construct_iteration_history( return previous_iterations_history +def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str: + chat_history = "" + for chat in conversation_history.get("chat", [])[-n:]: + if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]: + chat_history += f"User: {chat['intent']['query']}\n" + + if chat["intent"].get("inferred-queries"): + chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n' + + chat_history += f"{agent_name}: {chat['message']}\n\n" + elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")): + chat_history += f"User: {chat['intent']['query']}\n" + chat_history += f"{agent_name}: [generated image redacted for space]\n" + elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")): + chat_history += f"User: {chat['intent']['query']}\n" + chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n" + elif chat["by"] == "you": + raw_attached_files = chat.get("attachedFiles") + if raw_attached_files: + attached_files: Dict[str, str] = {} + for file in raw_attached_files: + attached_files[file["name"]] = file["content"] + + attached_file_context = gather_raw_attached_files(attached_files) + chat_history += f"User:
{attached_file_context}\n" + + return chat_history + + def construct_tool_chat_history( previous_iterations: List[InformationCollectionIteration], tool: ConversationCommand = None ) -> Dict[str, list]: @@ -540,35 +569,6 @@ def get_image_from_url(image_url: str, type="pil"): return ImageWithType(content=None, type=None) -def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str: - chat_history = "" - for chat in conversation_history.get("chat", [])[-n:]: - if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]: - chat_history += f"User: {chat['intent']['query']}\n" - - if chat["intent"].get("inferred-queries"): - chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n' - - chat_history += f"{agent_name}: {chat['message']}\n\n" - elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")): - chat_history += f"User: {chat['intent']['query']}\n" - chat_history += f"{agent_name}: [generated image redacted for space]\n" - elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")): - chat_history += f"User: {chat['intent']['query']}\n" - chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n" - elif chat["by"] == "you": - raw_attached_files = chat.get("attachedFiles") - if raw_attached_files: - attached_files: Dict[str, str] = {} - for file in raw_attached_files: - attached_files[file["name"]] = file["content"] - - attached_file_context = gather_raw_attached_files(attached_files) - chat_history += f"User: {attached_file_context}\n" - - return chat_history - - def commit_conversation_trace( session: list[ChatMessage], response: str | list[dict], From 4223b355dcc065de20d5d282edf5e6c91891977e Mon Sep 17 00:00:00 2001 From: Debanjum Date: Mon, 11 Nov 2024 03:20:35 -0800 Subject: [PATCH 2/3] Use python stdlib methods to write pdf, docx to temp files for loaders Use python standard method tempfile.NamedTemporaryFile to 
write, delete temporary files safely. --- .../processor/content/docx/docx_to_entries.py | 28 +++++--------- .../processor/content/pdf/pdf_to_entries.py | 37 +++++++------------ 2 files changed, 23 insertions(+), 42 deletions(-) diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py index 55dd8bac..19d9ba13 100644 --- a/src/khoj/processor/content/docx/docx_to_entries.py +++ b/src/khoj/processor/content/docx/docx_to_entries.py @@ -1,7 +1,5 @@ import logging -import os -from datetime import datetime -from random import randint +import tempfile from typing import Dict, List, Tuple from langchain_community.document_loaders import Docx2txtLoader @@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries): def extract_text(docx_file): """Extract text from specified DOCX file""" try: - timestamp_now = datetime.utcnow().timestamp() - random_suffix = randint(0, 1000) - tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx" docx_entry_by_pages = [] - with open(tmp_file, "wb") as f: - bytes_content = docx_file - f.write(bytes_content) + # Create temp file with .docx extension that gets auto-deleted + with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp: + tmp.write(docx_file) + tmp.flush() # Ensure all data is written - # Load the content using Docx2txtLoader - loader = Docx2txtLoader(tmp_file) - docx_entries_per_file = loader.load() - - # Convert the loaded entries into the desired format - docx_entry_by_pages = [page.page_content for page in docx_entries_per_file] + # Load the content using Docx2txtLoader + loader = Docx2txtLoader(tmp.name) + docx_entries_per_file = loader.load() + # Convert the loaded entries into the desired format + docx_entry_by_pages = [page.page_content for page in docx_entries_per_file] except Exception as e: logger.warning(f"Unable to extract text from file: {docx_file}") logger.warning(e, exc_info=True) - finally: - if os.path.exists(f"{tmp_file}"): - 
os.remove(f"{tmp_file}") return docx_entry_by_pages diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 311ac807..39685996 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -1,14 +1,10 @@ -import base64 import logging -import os -from datetime import datetime -from random import randint +import tempfile +from io import BytesIO from typing import Dict, List, Tuple from langchain_community.document_loaders import PyMuPDFLoader -# importing FileObjectAdapter so that we can add new files and debug file object db. -# from khoj.database.adapters import FileObjectAdapters from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser from khoj.processor.content.text_to_entries import TextToEntries @@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries): def extract_text(pdf_file): """Extract text from specified PDF files""" try: - # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path - timestamp_now = datetime.utcnow().timestamp() - random_suffix = randint(0, 1000) - tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf" - pdf_entry_by_pages = [] - with open(f"{tmp_file}", "wb") as f: - f.write(pdf_file) - try: - loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False) - pdf_entry_by_pages = [page.page_content for page in loader.load()] - except ImportError: - loader = PyMuPDFLoader(f"{tmp_file}") - pdf_entry_by_pages = [ - page.page_content for page in loader.load() - ] # page_content items list for a given pdf. 
+ # Create temp file with .pdf extension that gets auto-deleted + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf: + tmpf.write(pdf_file) + tmpf.flush() # Ensure all data is written + + # Load the content using PyMuPDFLoader + loader = PyMuPDFLoader(tmpf.name, extract_images=True) + pdf_entries_per_file = loader.load() + + # Convert the loaded entries into the desired format + pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file] except Exception as e: logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.") logger.warning(e, exc_info=True) - finally: - if os.path.exists(f"{tmp_file}"): - os.remove(f"{tmp_file}") return pdf_entry_by_pages From 7954f39633c8429ddbb9b4535ce1cc37e5f0c36e Mon Sep 17 00:00:00 2001 From: Debanjum Date: Mon, 11 Nov 2024 04:06:17 -0800 Subject: [PATCH 3/3] Use accept param to file input to indicate supported file types in web app Remove unused total size calculations in chat input --- .../web/app/components/chatInputArea/chatInputArea.tsx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx index 49157f56..8b276562 100644 --- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx +++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx @@ -257,9 +257,6 @@ export const ChatInputArea = forwardRef((pr setConvertedAttachedFiles(data); }); - const totalSize = Array.from(files).reduce((acc, file) => acc + file.size, 0); - const totalSizeInMB = totalSize / (1024 * 1024); - // Set focus to the input for user message after uploading files chatInputRef?.current?.focus(); } @@ -612,6 +609,7 @@ export const ChatInputArea = forwardRef((pr >