mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Only check content type if file extension cannot identify text file
This commit is contained in:
@@ -51,11 +51,10 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||||||
def is_plaintextfile(file: str):
|
def is_plaintextfile(file: str):
|
||||||
"Check if file is plaintext file"
|
"Check if file is plaintext file"
|
||||||
# Check if file path exists
|
# Check if file path exists
|
||||||
content_identity = magika.identify_path(Path(file)).output
|
content_group = magika.identify_path(Path(file)).output.group
|
||||||
if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]:
|
|
||||||
return content_identity.group in ["text", "code"]
|
|
||||||
# Use file extension to decide plaintext if file content is not identifiable
|
# Use file extension to decide plaintext if file content is not identifiable
|
||||||
return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
|
# File name suffixes accepted as plaintext based on the name alone.
# Every entry needs its own comma: adjacent string literals are implicitly
# concatenated, so `"org" "mbox"` would become the single suffix "orgmbox"
# and neither org nor mbox files would match.
valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
|
||||||
|
return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]
|
||||||
|
|
||||||
def extract_html_content(html_content: str):
|
def extract_html_content(html_content: str):
|
||||||
"Extract content from HTML"
|
"Extract content from HTML"
|
||||||
|
|||||||
@@ -102,22 +102,20 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
|||||||
|
|
||||||
# Infer content type from reading file content
|
# Infer content type from reading file content
|
||||||
try:
|
try:
|
||||||
content_identity = magika.identify_bytes(file_content).output
|
content_group = magika.identify_bytes(file_content).output.group
|
||||||
content_type = content_identity.mime_type
|
|
||||||
content_group = content_identity.group
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback to using just file type if content type cannot be inferred
|
# Fallback to using just file type if content type cannot be inferred
|
||||||
content_type = file_type
|
content_group = "unknown"
|
||||||
|
|
||||||
if file_type in ["text/markdown"] and content_group in ["code", "text"]:
|
if file_type in ["text/markdown"]:
|
||||||
return "markdown", encoding
|
return "markdown", encoding
|
||||||
elif file_type in ["text/org"] and content_group in ["code", "text"]:
|
elif file_type in ["text/org"]:
|
||||||
return "org", encoding
|
return "org", encoding
|
||||||
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
elif file_type in ["application/pdf"]:
|
||||||
return "pdf", encoding
|
return "pdf", encoding
|
||||||
elif file_type in ["image/jpeg"] and content_type == "image/jpeg":
|
elif file_type in ["image/jpeg"]:
|
||||||
return "jpeg", encoding
|
return "jpeg", encoding
|
||||||
elif file_type in ["image/png"] and content_type == "image/png":
|
elif file_type in ["image/png"]:
|
||||||
return "png", encoding
|
return "png", encoding
|
||||||
elif content_group in ["code", "text"]:
|
elif content_group in ["code", "text"]:
|
||||||
return "plaintext", encoding
|
return "plaintext", encoding
|
||||||
|
|||||||
Reference in New Issue
Block a user