From 7d2ef728e6820441f8c4e591789f217a9c56957b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Fri, 12 Apr 2024 02:49:39 +0530
Subject: [PATCH] Fix identifying pdf files on server

The previous commit introduced a bug that stopped PDF files from being
indexed: it compared the content group, rather than the MIME type,
against "application/pdf".
---
 src/khoj/utils/helpers.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index ada828b3..d14ef62e 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -102,14 +102,16 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
 
     # Infer content type from reading file content
     try:
-        content_type = magika.identify_bytes(file_content).output.group
+        content_identity = magika.identify_bytes(file_content).output
+        content_type = content_identity.mime_type
+        content_group = content_identity.group
     except Exception:
         # Fallback to using just file type if content type cannot be inferred
         content_type = file_type
 
-    if file_type in ["text/markdown"] and content_type in ["code", "text"]:
+    if file_type in ["text/markdown"] and content_group in ["code", "text"]:
         return "markdown", encoding
-    elif file_type in ["text/org"] and content_type in ["code", "text"]:
+    elif file_type in ["text/org"] and content_group in ["code", "text"]:
         return "org", encoding
     elif file_type in ["application/pdf"] and content_type == "application/pdf":
         return "pdf", encoding
@@ -117,7 +119,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
         return "jpeg", encoding
     elif file_type in ["image/png"] and content_type == "image/png":
         return "png", encoding
-    elif content_type in ["code", "text"]:
+    elif content_group in ["code", "text"]:
         return "plaintext", encoding
     else:
         return "other", encoding