Make identification of text and code files with Magika more robust on the server

Use the content group identified by Magika, rather than its mime_type, to find text and code files.
This commit is contained in:
Debanjum Singh Solanky
2024-04-12 02:07:32 +05:30
parent 60337086f9
commit a7d9102c33
3 changed files with 15 additions and 13 deletions

View File

@@ -154,16 +154,18 @@ class GithubToEntries(TextToEntries):
content_bytes = self.get_file_contents(item["url"], decode=False) content_bytes = self.get_file_contents(item["url"], decode=False)
content_type, content_str = None, None content_type, content_str = None, None
try: try:
content_type = magika.identify_bytes(content_bytes).output.mime_type content_type = magika.identify_bytes(content_bytes).output.group
content_str = content_bytes.decode("utf-8")
except: except:
logger.error( logger.error(f"Unable to identify content type of file at {url_path}. Skip indexing it")
f"Unable to identify content type or decode content of file at {url_path}. Skip indexing it"
)
continue continue
# Add non-binary file contents and URL to list # Add non-binary file contents and URL to list
if content_type.startswith("text/"): if content_type in ["text", "code"]:
try:
content_str = content_bytes.decode("utf-8")
except:
logger.error(f"Unable to decode content of file at {url_path}. Skip indexing it")
continue
plaintext_files += [{"content": content_str, "path": url_path}] plaintext_files += [{"content": content_str, "path": url_path}]
return markdown_files, org_files, plaintext_files return markdown_files, org_files, plaintext_files

View File

@@ -51,9 +51,9 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
def is_plaintextfile(file: str): def is_plaintextfile(file: str):
"Check if file is plaintext file" "Check if file is plaintext file"
# Check if file path exists # Check if file path exists
mime_type = magika.identify_path(Path(file)).output.mime_type content_identity = magika.identify_path(Path(file)).output
if mime_type != "inode/x-empty" and mime_type != "application/unknown": if content_identity.mime_type not in ["inode/x-empty", "application/unknown"]:
return mime_type.startswith("text/") return content_identity.group in ["text", "code"]
# Use file extension to decide plaintext if file content is not identifiable # Use file extension to decide plaintext if file content is not identifiable
return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")) return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))

View File

@@ -102,14 +102,14 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
# Infer content type from reading file content # Infer content type from reading file content
try: try:
content_type = magika.identify_bytes(file_content).output.mime_type content_type = magika.identify_bytes(file_content).output.group
except Exception: except Exception:
# Fallback to using just file type if content type cannot be inferred # Fallback to using just file type if content type cannot be inferred
content_type = file_type content_type = file_type
if file_type in ["text/markdown"] and content_type.startswith("text/"): if file_type in ["text/markdown"] and content_type in ["code", "text"]:
return "markdown", encoding return "markdown", encoding
elif file_type in ["text/org"] and content_type.startswith("text/"): elif file_type in ["text/org"] and content_type in ["code", "text"]:
return "org", encoding return "org", encoding
elif file_type in ["application/pdf"] and content_type == "application/pdf": elif file_type in ["application/pdf"] and content_type == "application/pdf":
return "pdf", encoding return "pdf", encoding
@@ -117,7 +117,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
return "jpeg", encoding return "jpeg", encoding
elif file_type in ["image/png"] and content_type == "image/png": elif file_type in ["image/png"] and content_type == "image/png":
return "png", encoding return "png", encoding
elif content_type.startswith("text/"): elif content_type in ["code", "text"]:
return "plaintext", encoding return "plaintext", encoding
else: else:
return "other", encoding return "other", encoding