Fix identifying PDF files on server

The previous commit introduced a bug that stopped PDF files from being indexed: the check compared the content group (content_group) instead of the MIME type (mime_type) against application/pdf.
2026-03-02 13:18:18 +00:00 · 2024-04-12 02:49:39 +05:30
parent 07f8fb5c5b
commit 7d2ef728e6
1 changed files with 6 additions and 4 deletions
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -102,14 +102,16 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:

    # Infer content type from reading file content
    try:
-        content_type = magika.identify_bytes(file_content).output.group
+        content_identity = magika.identify_bytes(file_content).output
+        content_type = content_identity.mime_type
+        content_group = content_identity.group
    except Exception:
        # Fallback to using just file type if content type cannot be inferred
        content_type = file_type

-    if file_type in ["text/markdown"] and content_type in ["code", "text"]:
+    if file_type in ["text/markdown"] and content_group in ["code", "text"]:
        return "markdown", encoding
-    elif file_type in ["text/org"] and content_type in ["code", "text"]:
+    elif file_type in ["text/org"] and content_group in ["code", "text"]:
        return "org", encoding
    elif file_type in ["application/pdf"] and content_type == "application/pdf":
        return "pdf", encoding
@@ -117,7 +119,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
        return "jpeg", encoding
    elif file_type in ["image/png"] and content_type == "image/png":
        return "png", encoding
-    elif content_type in ["code", "text"]:
+    elif content_group in ["code", "text"]:
        return "plaintext", encoding
    else:
        return "other", encoding