From 5c7797dbca8976dab0afcfcff5816a507250726d Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Fri, 12 Apr 2024 03:40:42 +0530
Subject: [PATCH] Only check content type if file extension cannot identify
 text file

---
 src/khoj/utils/fs_syncer.py |  7 +++----
 src/khoj/utils/helpers.py   | 16 +++++++---------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py
index 03123b8b..5a20f418 100644
--- a/src/khoj/utils/fs_syncer.py
+++ b/src/khoj/utils/fs_syncer.py
@@ -51,11 +51,10 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
     def is_plaintextfile(file: str):
         "Check if file is plaintext file"
         # Check if file path exists
-        content_identity = magika.identify_path(Path(file)).output
-        if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]:
-            return content_identity.group in ["text", "code"]
+        content_group = magika.identify_path(Path(file)).output.group
         # Use file extension to decide plaintext if file content is not identifiable
-        return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
+        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
+        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]
 
     def extract_html_content(html_content: str):
         "Extract content from HTML"
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index d14ef62e..f6a66b4f 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -102,22 +102,20 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
 
     # Infer content type from reading file content
     try:
-        content_identity = magika.identify_bytes(file_content).output
-        content_type = content_identity.mime_type
-        content_group = content_identity.group
+        content_group = magika.identify_bytes(file_content).output.group
     except Exception:
         # Fallback to using just file type if content type cannot be inferred
-        content_type = file_type
+        content_group = "unknown"
 
-    if file_type in ["text/markdown"] and content_group in ["code", "text"]:
+    if file_type in ["text/markdown"]:
         return "markdown", encoding
-    elif file_type in ["text/org"] and content_group in ["code", "text"]:
+    elif file_type in ["text/org"]:
         return "org", encoding
-    elif file_type in ["application/pdf"] and content_type == "application/pdf":
+    elif file_type in ["application/pdf"]:
         return "pdf", encoding
-    elif file_type in ["image/jpeg"] and content_type == "image/jpeg":
+    elif file_type in ["image/jpeg"]:
         return "jpeg", encoding
-    elif file_type in ["image/png"] and content_type == "image/png":
+    elif file_type in ["image/png"]:
         return "png", encoding
     elif content_group in ["code", "text"]:
         return "plaintext", encoding