mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-08 05:39:13 +00:00
Use encoding of each file set in indexer request to read file
Get the encoding type of each file from the multipart/form-data request body. Read text files as utf-8, and PDFs and images as binary.
This commit is contained in:
@@ -93,9 +93,9 @@ function filenameToMimeType (filename) {
|
|||||||
case 'png':
|
case 'png':
|
||||||
return 'image/png';
|
return 'image/png';
|
||||||
case 'jpg':
|
case 'jpg':
|
||||||
return 'image/jpeg';
|
|
||||||
case 'jpeg':
|
case 'jpeg':
|
||||||
return 'image/jpeg';
|
return 'image/jpeg';
|
||||||
|
case 'md':
|
||||||
case 'markdown':
|
case 'markdown':
|
||||||
return 'text/markdown';
|
return 'text/markdown';
|
||||||
case 'org':
|
case 'org':
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ async def index_batch(
|
|||||||
plaintext_files: Dict[str, str] = {}
|
plaintext_files: Dict[str, str] = {}
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
file_type = get_file_type(file.content_type)
|
file_type, encoding = get_file_type(file.content_type)
|
||||||
dict_to_update = None
|
dict_to_update = None
|
||||||
if file_type == "org":
|
if file_type == "org":
|
||||||
dict_to_update = org_files
|
dict_to_update = org_files
|
||||||
@@ -85,7 +85,9 @@ async def index_batch(
|
|||||||
dict_to_update = plaintext_files
|
dict_to_update = plaintext_files
|
||||||
|
|
||||||
if dict_to_update is not None:
|
if dict_to_update is not None:
|
||||||
dict_to_update[file.filename] = file.file.read().decode("utf-8")
|
dict_to_update[file.filename] = (
|
||||||
|
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}")
|
logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}")
|
||||||
|
|
||||||
|
|||||||
@@ -66,24 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
|
|||||||
return merged_dict
|
return merged_dict
|
||||||
|
|
||||||
|
|
||||||
def get_file_type(file_type: str) -> str:
|
def get_file_type(file_type: str) -> tuple[str, str]:
|
||||||
"Get file type from file mime type"
|
"Get file type from file mime type"
|
||||||
|
|
||||||
|
encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None
|
||||||
file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
|
file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
|
||||||
if file_type in ["text/markdown"]:
|
if file_type in ["text/markdown"]:
|
||||||
return "markdown"
|
return "markdown", encoding
|
||||||
elif file_type in ["text/org"]:
|
elif file_type in ["text/org"]:
|
||||||
return "org"
|
return "org", encoding
|
||||||
elif file_type in ["application/pdf"]:
|
elif file_type in ["application/pdf"]:
|
||||||
return "pdf"
|
return "pdf", encoding
|
||||||
elif file_type in ["image/jpeg"]:
|
elif file_type in ["image/jpeg"]:
|
||||||
return "jpeg"
|
return "jpeg", encoding
|
||||||
elif file_type in ["image/png"]:
|
elif file_type in ["image/png"]:
|
||||||
return "png"
|
return "png", encoding
|
||||||
elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
|
elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
|
||||||
return "plaintext"
|
return "plaintext", encoding
|
||||||
else:
|
else:
|
||||||
return "other"
|
return "other", encoding
|
||||||
|
|
||||||
|
|
||||||
def load_model(
|
def load_model(
|
||||||
|
|||||||
Reference in New Issue
Block a user