Only check content type if file extension cannot identify text file

2026-03-08 05:39:13 +00:00 · 2024-04-12 03:40:42 +05:30
parent 7d2ef728e6
commit 5c7797dbca
2 changed files with 10 additions and 13 deletions
--- a/src/khoj/utils/fs_syncer.py
+++ b/src/khoj/utils/fs_syncer.py
@@ -51,11 +51,10 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
    def is_plaintextfile(file: str):
        "Check if file is plaintext file"
        # Check if file path exists
-        content_identity = magika.identify_path(Path(file)).output
+        content_group = magika.identify_path(Path(file)).output.group
-        if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]:
-            return content_identity.group in ["text", "code"]
        # Use file extension to decide plaintext if file content is not identifiable
-        return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
+        valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
+        return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]
    def extract_html_content(html_content: str):
        "Extract content from HTML"
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -102,22 +102,20 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
    # Infer content type from reading file content
    try:
-        content_identity = magika.identify_bytes(file_content).output
+        content_group = magika.identify_bytes(file_content).output.group
-        content_type = content_identity.mime_type
-        content_group = content_identity.group
    except Exception:
        # Fallback to using just file type if content type cannot be inferred
-        content_type = file_type
+        content_group = "unknown"
-    if file_type in ["text/markdown"] and content_group in ["code", "text"]:
+    if file_type in ["text/markdown"]:
        return "markdown", encoding
-    elif file_type in ["text/org"] and content_group in ["code", "text"]:
+    elif file_type in ["text/org"]:
        return "org", encoding
-    elif file_type in ["application/pdf"] and content_type == "application/pdf":
+    elif file_type in ["application/pdf"]:
        return "pdf", encoding
-    elif file_type in ["image/jpeg"] and content_type == "image/jpeg":
+    elif file_type in ["image/jpeg"]:
        return "jpeg", encoding
-    elif file_type in ["image/png"] and content_type == "image/png":
+    elif file_type in ["image/png"]:
        return "png", encoding
    elif content_group in ["code", "text"]:
        return "plaintext", encoding