mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-07 21:29:13 +00:00
Make identification of text and code files with Magika more robust on the server
Use the content group identified by Magika, rather than the mime_type, to find text files.
This commit is contained in:
@@ -154,16 +154,18 @@ class GithubToEntries(TextToEntries):
|
|||||||
content_bytes = self.get_file_contents(item["url"], decode=False)
|
content_bytes = self.get_file_contents(item["url"], decode=False)
|
||||||
content_type, content_str = None, None
|
content_type, content_str = None, None
|
||||||
try:
|
try:
|
||||||
content_type = magika.identify_bytes(content_bytes).output.mime_type
|
content_type = magika.identify_bytes(content_bytes).output.group
|
||||||
content_str = content_bytes.decode("utf-8")
|
|
||||||
except:
|
except:
|
||||||
logger.error(
|
logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
|
||||||
f"Unable to identify content type or decode content of file at {url_path}. Skip indexing it"
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Add non-binary file contents and URL to list
|
# Add non-binary file contents and URL to list
|
||||||
if content_type.startswith("text/"):
|
if content_type in ["text", "code"]:
|
||||||
|
try:
|
||||||
|
content_str = content_bytes.decode("utf-8")
|
||||||
|
except:
|
||||||
|
logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
|
||||||
|
continue
|
||||||
plaintext_files += [{"content": content_str, "path": url_path}]
|
plaintext_files += [{"content": content_str, "path": url_path}]
|
||||||
|
|
||||||
return markdown_files, org_files, plaintext_files
|
return markdown_files, org_files, plaintext_files
|
||||||
|
|||||||
@@ -51,9 +51,9 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||||||
def is_plaintextfile(file: str):
|
def is_plaintextfile(file: str):
|
||||||
"Check if file is plaintext file"
|
"Check if file is plaintext file"
|
||||||
# Check if file path exists
|
# Check if file path exists
|
||||||
mime_type = magika.identify_path(Path(file)).output.mime_type
|
content_identity = magika.identify_path(Path(file)).output
|
||||||
if mime_type != "inode/x-empty" and mime_type != "application/unknown":
|
if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]:
|
||||||
return mime_type.startswith("text/")
|
return content_identity.group in ["text", "code"]
|
||||||
# Use file extension to decide plaintext if file content is not identifiable
|
# Use file extension to decide plaintext if file content is not identifiable
|
||||||
return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
|
return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
|
||||||
|
|
||||||
|
|||||||
@@ -102,14 +102,14 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
|||||||
|
|
||||||
# Infer content type from reading file content
|
# Infer content type from reading file content
|
||||||
try:
|
try:
|
||||||
content_type = magika.identify_bytes(file_content).output.mime_type
|
content_type = magika.identify_bytes(file_content).output.group
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback to using just file type if content type cannot be inferred
|
# Fallback to using just file type if content type cannot be inferred
|
||||||
content_type = file_type
|
content_type = file_type
|
||||||
|
|
||||||
if file_type in ["text/markdown"] and content_type.startswith("text/"):
|
if file_type in ["text/markdown"] and content_type in ["code", "text"]:
|
||||||
return "markdown", encoding
|
return "markdown", encoding
|
||||||
elif file_type in ["text/org"] and content_type.startswith("text/"):
|
elif file_type in ["text/org"] and content_type in ["code", "text"]:
|
||||||
return "org", encoding
|
return "org", encoding
|
||||||
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
||||||
return "pdf", encoding
|
return "pdf", encoding
|
||||||
@@ -117,7 +117,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
|||||||
return "jpeg", encoding
|
return "jpeg", encoding
|
||||||
elif file_type in ["image/png"] and content_type == "image/png":
|
elif file_type in ["image/png"] and content_type == "image/png":
|
||||||
return "png", encoding
|
return "png", encoding
|
||||||
elif content_type.startswith("text/"):
|
elif content_type in ["code", "text"]:
|
||||||
return "plaintext", encoding
|
return "plaintext", encoding
|
||||||
else:
|
else:
|
||||||
return "other", encoding
|
return "other", encoding
|
||||||
|
|||||||
Reference in New Issue
Block a user