From fd15fc1e598b3dc7f720e86cb336c3ae1199b66f Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Mon, 11 Nov 2024 03:23:58 -0800
Subject: [PATCH 1/3] Move construct chat history back to its original
 position in file

Keeping the function where it originally was allows tracking diffs and
change history more easily
---
 src/khoj/processor/conversation/utils.py | 58 ++++++++++++------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
index b46e60ed..7fe83d06 100644
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -140,6 +140,35 @@ def construct_iteration_history(
     return previous_iterations_history
 
 
+def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str:
+    chat_history = ""
+    for chat in conversation_history.get("chat", [])[-n:]:
+        if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
+            chat_history += f"User: {chat['intent']['query']}\n"
+
+            if chat["intent"].get("inferred-queries"):
+                chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
+
+            chat_history += f"{agent_name}: {chat['message']}\n\n"
+        elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
+            chat_history += f"User: {chat['intent']['query']}\n"
+            chat_history += f"{agent_name}: [generated image redacted for space]\n"
+        elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
+            chat_history += f"User: {chat['intent']['query']}\n"
+            chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
+        elif chat["by"] == "you":
+            raw_attached_files = chat.get("attachedFiles")
+            if raw_attached_files:
+                attached_files: Dict[str, str] = {}
+                for file in raw_attached_files:
+                    attached_files[file["name"]] = file["content"]
+
+                attached_file_context = gather_raw_attached_files(attached_files)
+                chat_history += f"User: {attached_file_context}\n"
+
+    return chat_history
+
+
 def construct_tool_chat_history(
     previous_iterations: List[InformationCollectionIteration], tool: ConversationCommand = None
 ) -> Dict[str, list]:
@@ -540,35 +569,6 @@ def get_image_from_url(image_url: str, type="pil"):
         return ImageWithType(content=None, type=None)
 
 
-def construct_chat_history(conversation_history: dict, n: int = 4, agent_name="AI") -> str:
-    chat_history = ""
-    for chat in conversation_history.get("chat", [])[-n:]:
-        if chat["by"] == "khoj" and chat["intent"].get("type") in ["remember", "reminder", "summarize"]:
-            chat_history += f"User: {chat['intent']['query']}\n"
-
-            if chat["intent"].get("inferred-queries"):
-                chat_history += f'{agent_name}: {{"queries": {chat["intent"].get("inferred-queries")}}}\n'
-
-            chat_history += f"{agent_name}: {chat['message']}\n\n"
-        elif chat["by"] == "khoj" and ("text-to-image" in chat["intent"].get("type")):
-            chat_history += f"User: {chat['intent']['query']}\n"
-            chat_history += f"{agent_name}: [generated image redacted for space]\n"
-        elif chat["by"] == "khoj" and ("excalidraw" in chat["intent"].get("type")):
-            chat_history += f"User: {chat['intent']['query']}\n"
-            chat_history += f"{agent_name}: {chat['intent']['inferred-queries'][0]}\n"
-        elif chat["by"] == "you":
-            raw_attached_files = chat.get("attachedFiles")
-            if raw_attached_files:
-                attached_files: Dict[str, str] = {}
-                for file in raw_attached_files:
-                    attached_files[file["name"]] = file["content"]
-
-                attached_file_context = gather_raw_attached_files(attached_files)
-                chat_history += f"User: {attached_file_context}\n"
-
-    return chat_history
-
-
 def commit_conversation_trace(
     session: list[ChatMessage],
     response: str | list[dict],

From 4223b355dcc065de20d5d282edf5e6c91891977e Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Mon, 11 Nov 2024 03:20:35 -0800
Subject: [PATCH 2/3] Use python stdlib methods to write pdf, docx to temp
 files for loaders

Use the Python standard library's tempfile.NamedTemporaryFile to write
and delete temporary files safely.
---
 .../processor/content/docx/docx_to_entries.py | 28 +++++---------
 .../processor/content/pdf/pdf_to_entries.py   | 37 +++++++------------
 2 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py
index 55dd8bac..19d9ba13 100644
--- a/src/khoj/processor/content/docx/docx_to_entries.py
+++ b/src/khoj/processor/content/docx/docx_to_entries.py
@@ -1,7 +1,5 @@
 import logging
-import os
-from datetime import datetime
-from random import randint
+import tempfile
 from typing import Dict, List, Tuple
 
 from langchain_community.document_loaders import Docx2txtLoader
@@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries):
     def extract_text(docx_file):
         """Extract text from specified DOCX file"""
         try:
-            timestamp_now = datetime.utcnow().timestamp()
-            random_suffix = randint(0, 1000)
-            tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
             docx_entry_by_pages = []
-            with open(tmp_file, "wb") as f:
-                bytes_content = docx_file
-                f.write(bytes_content)
+            # Create temp file with .docx extension that gets auto-deleted
+            with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
+                tmp.write(docx_file)
+                tmp.flush()  # Ensure all data is written
 
-            # Load the content using Docx2txtLoader
-            loader = Docx2txtLoader(tmp_file)
-            docx_entries_per_file = loader.load()
-
-            # Convert the loaded entries into the desired format
-            docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
+                # Load the content using Docx2txtLoader
+                loader = Docx2txtLoader(tmp.name)
+                docx_entries_per_file = loader.load()
 
+                # Convert the loaded entries into the desired format
+                docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
         except Exception as e:
             logger.warning(f"Unable to extract text from file: {docx_file}")
             logger.warning(e, exc_info=True)
-        finally:
-            if os.path.exists(f"{tmp_file}"):
-                os.remove(f"{tmp_file}")
 
         return docx_entry_by_pages
diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py
index 311ac807..39685996 100644
--- a/src/khoj/processor/content/pdf/pdf_to_entries.py
+++ b/src/khoj/processor/content/pdf/pdf_to_entries.py
@@ -1,14 +1,10 @@
-import base64
 import logging
-import os
-from datetime import datetime
-from random import randint
+import tempfile
+from io import BytesIO
 from typing import Dict, List, Tuple
 
 from langchain_community.document_loaders import PyMuPDFLoader
 
-# importing FileObjectAdapter so that we can add new files and debug file object db.
-# from khoj.database.adapters import FileObjectAdapters
 from khoj.database.models import Entry as DbEntry
 from khoj.database.models import KhojUser
 from khoj.processor.content.text_to_entries import TextToEntries
@@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries):
     def extract_text(pdf_file):
         """Extract text from specified PDF files"""
         try:
-            # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
-            timestamp_now = datetime.utcnow().timestamp()
-            random_suffix = randint(0, 1000)
-            tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
-            pdf_entry_by_pages = []
-            with open(f"{tmp_file}", "wb") as f:
-                f.write(pdf_file)
-            try:
-                loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
-                pdf_entry_by_pages = [page.page_content for page in loader.load()]
-            except ImportError:
-                loader = PyMuPDFLoader(f"{tmp_file}")
-                pdf_entry_by_pages = [
-                    page.page_content for page in loader.load()
-                ]  # page_content items list for a given pdf.
+            # Create temp file with .pdf extension that gets auto-deleted
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf:
+                tmpf.write(pdf_file)
+                tmpf.flush()  # Ensure all data is written
+
+                # Load the content using PyMuPDFLoader
+                loader = PyMuPDFLoader(tmpf.name, extract_images=True)
+                pdf_entries_per_file = loader.load()
+
+                # Convert the loaded entries into the desired format
+                pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file]
         except Exception as e:
             logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
             logger.warning(e, exc_info=True)
-        finally:
-            if os.path.exists(f"{tmp_file}"):
-                os.remove(f"{tmp_file}")
 
         return pdf_entry_by_pages

From 7954f39633c8429ddbb9b4535ce1cc37e5f0c36e Mon Sep 17 00:00:00 2001
From: Debanjum <debanjum@gmail.com>
Date: Mon, 11 Nov 2024 04:06:17 -0800
Subject: [PATCH 3/3] Use accept param on file input to indicate supported file
 types in web app

Remove unused total size calculations in chat input
---
 .../web/app/components/chatInputArea/chatInputArea.tsx        | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
index 49157f56..8b276562 100644
--- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
+++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
@@ -257,9 +257,6 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
             setConvertedAttachedFiles(data);
         });
 
-        const totalSize = Array.from(files).reduce((acc, file) => acc + file.size, 0);
-        const totalSizeInMB = totalSize / (1024 * 1024);
-
         // Set focus to the input for user message after uploading files
         chatInputRef?.current?.focus();
     }
@@ -612,6 +609,7 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
                 >
                     <input
                         type="file"
+                        accept=".pdf,.doc,.docx,.txt,.md,.org,.jpg,.jpeg,.png,.webp"
                         multiple={true}
                         ref={fileInputRef}
                         onChange={handleFileChange}