From 5c7797dbca8976dab0afcfcff5816a507250726d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 12 Apr 2024 03:40:42 +0530 Subject: [PATCH] Only check content type if file extension cannot identify text file --- src/khoj/utils/fs_syncer.py | 7 +++---- src/khoj/utils/helpers.py | 16 +++++++--------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 03123b8b..5a20f418 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -51,11 +51,10 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: def is_plaintextfile(file: str): "Check if file is plaintext file" # Check if file path exists - content_identity = magika.identify_path(Path(file)).output - if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]: - return content_identity.group in ["text", "code"] + content_group = magika.identify_path(Path(file)).output.group # Use file extension to decide plaintext if file content is not identifiable - return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")) + valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml") + return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"] def extract_html_content(html_content: str): "Extract content from HTML" diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index d14ef62e..f6a66b4f 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -102,22 +102,20 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]: # Infer content type from reading file content try: - content_identity = magika.identify_bytes(file_content).output - content_type = content_identity.mime_type - content_group = content_identity.group + content_group = magika.identify_bytes(file_content).output.group except Exception: # Fallback to using just file type if
content type cannot be inferred - content_type = file_type + content_group = "unknown" - if file_type in ["text/markdown"] and content_group in ["code", "text"]: + if file_type in ["text/markdown"]: return "markdown", encoding - elif file_type in ["text/org"] and content_group in ["code", "text"]: + elif file_type in ["text/org"]: return "org", encoding - elif file_type in ["application/pdf"] and content_type == "application/pdf": + elif file_type in ["application/pdf"]: return "pdf", encoding - elif file_type in ["image/jpeg"] and content_type == "image/jpeg": + elif file_type in ["image/jpeg"]: return "jpeg", encoding - elif file_type in ["image/png"] and content_type == "image/png": + elif file_type in ["image/png"]: return "png", encoding elif content_group in ["code", "text"]: return "plaintext", encoding