Identify indexable plaintext by magika content group instead of mime type.

Review notes (text before the first "diff --git" line is ignored by patch tools):
- github_to_entries.py / fs_syncer.py: content whose magika group is "text" or "code"
  is treated as plaintext. In the Github indexer, decoding is attempted only for such
  content, and identification / decoding failures are logged and skipped separately.
  The handlers catch Exception rather than using a bare except, so KeyboardInterrupt
  and SystemExit are no longer swallowed inside the indexing loop.
- helpers.py: get_file_type keeps the inferred mime type in content_type, because the
  pdf and png branches compare it against mime strings (the jpeg condition lies
  outside the hunks; presumably it does the same - confirm). The magika group is held
  in a separate content_group and used only for the markdown / org / plaintext checks.
  When identification fails, content_group is derived from file_type so text/* uploads
  are still recognised.
- NOTE(review): leading indentation and blank context lines were reconstructed from
  the hunk line counts; confirm them against the target tree before applying. The
  post-image hashes on the "index" lines were not recomputed for the edited hunks.

diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py
index 8fc1e18b..02fa4cf0 100644
--- a/src/khoj/processor/content/github/github_to_entries.py
+++ b/src/khoj/processor/content/github/github_to_entries.py
@@ -154,16 +154,18 @@ class GithubToEntries(TextToEntries):
                 content_bytes = self.get_file_contents(item["url"], decode=False)
                 content_type, content_str = None, None
                 try:
-                    content_type = magika.identify_bytes(content_bytes).output.mime_type
-                    content_str = content_bytes.decode("utf-8")
-                except:
-                    logger.error(
-                        f"Unable to identify content type or decode content of file at {url_path}. Skip indexing it"
-                    )
+                    content_type = magika.identify_bytes(content_bytes).output.group
+                except Exception:
+                    logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
                     continue
 
                 # Add non-binary file contents and URL to list
-                if content_type.startswith("text/"):
+                if content_type in ["text", "code"]:
+                    try:
+                        content_str = content_bytes.decode("utf-8")
+                    except Exception:
+                        logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
+                        continue
                     plaintext_files += [{"content": content_str, "path": url_path}]
 
         return markdown_files, org_files, plaintext_files
diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py
index f9b7fc62..03123b8b 100644
--- a/src/khoj/utils/fs_syncer.py
+++ b/src/khoj/utils/fs_syncer.py
@@ -51,9 +51,9 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
     def is_plaintextfile(file: str):
         "Check if file is plaintext file"
         # Check if file path exists
-        mime_type = magika.identify_path(Path(file)).output.mime_type
-        if mime_type != "inode/x-empty" and mime_type != "application/unknown":
-            return mime_type.startswith("text/")
+        content_identity = magika.identify_path(Path(file)).output
+        if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]:
+            return content_identity.group in ["text", "code"]
         # Use file extension to decide plaintext if file content is not identifiable
         return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
 
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index d5bf9b4b..ada828b3 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -102,14 +102,18 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
 
     # Infer content type from reading file content
     try:
-        content_type = magika.identify_bytes(file_content).output.mime_type
+        content_identity = magika.identify_bytes(file_content).output
+        # Keep the mime type for the exact-match checks below; use the group for text checks
+        content_type, content_group = content_identity.mime_type, content_identity.group
     except Exception:
         # Fallback to using just file type if content type cannot be inferred
         content_type = file_type
+        # file_type is a mime type, not a magika group; map text/* onto the "text" group
+        content_group = "text" if file_type.startswith("text/") else "unknown"
 
-    if file_type in ["text/markdown"] and content_type.startswith("text/"):
+    if file_type in ["text/markdown"] and content_group in ["code", "text"]:
         return "markdown", encoding
-    elif file_type in ["text/org"] and content_type.startswith("text/"):
+    elif file_type in ["text/org"] and content_group in ["code", "text"]:
         return "org", encoding
     elif file_type in ["application/pdf"] and content_type == "application/pdf":
         return "pdf", encoding
@@ -117,7 +121,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
         return "jpeg", encoding
     elif file_type in ["image/png"] and content_type == "image/png":
         return "png", encoding
-    elif content_type.startswith("text/"):
+    elif content_group in ["code", "text"]:
         return "plaintext", encoding
     else:
         return "other", encoding