From 7d2ef728e6820441f8c4e591789f217a9c56957b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Fri, 12 Apr 2024 02:49:39 +0530 Subject: [PATCH] Fix identifying pdf files on server The previous commit introduced a bug that stopped PDF files from being indexed: it checked whether the content group, instead of the mime type, is application/pdf --- src/khoj/utils/helpers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index ada828b3..d14ef62e 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -102,14 +102,16 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]: # Infer content type from reading file content try: - content_type = magika.identify_bytes(file_content).output.group + content_identity = magika.identify_bytes(file_content).output + content_type = content_identity.mime_type + content_group = content_identity.group except Exception: # Fallback to using just file type if content type cannot be inferred content_type = file_type - if file_type in ["text/markdown"] and content_type in ["code", "text"]: + if file_type in ["text/markdown"] and content_group in ["code", "text"]: return "markdown", encoding - elif file_type in ["text/org"] and content_type in ["code", "text"]: + elif file_type in ["text/org"] and content_group in ["code", "text"]: return "org", encoding elif file_type in ["application/pdf"] and content_type == "application/pdf": return "pdf", encoding @@ -117,7 +119,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]: return "jpeg", encoding elif file_type in ["image/png"] and content_type == "image/png": return "png", encoding - elif content_type in ["code", "text"]: + elif content_group in ["code", "text"]: return "plaintext", encoding else: return "other", encoding