Use the encoding set for each file in the indexer request to read that file

Get the encoding of each file from its part in the multipart/form-data request body
Read text files as UTF-8, and read PDFs and images as binary
This commit is contained in:
Debanjum Singh Solanky
2023-10-17 02:37:20 -07:00
parent 8e627a5809
commit d27dc71dfe
3 changed files with 14 additions and 11 deletions

View File

@@ -93,9 +93,9 @@ function filenameToMimeType (filename) {
case 'png': case 'png':
return 'image/png'; return 'image/png';
case 'jpg': case 'jpg':
return 'image/jpeg';
case 'jpeg': case 'jpeg':
return 'image/jpeg'; return 'image/jpeg';
case 'md':
case 'markdown': case 'markdown':
return 'text/markdown'; return 'text/markdown';
case 'org': case 'org':

View File

@@ -73,7 +73,7 @@ async def index_batch(
plaintext_files: Dict[str, str] = {} plaintext_files: Dict[str, str] = {}
for file in files: for file in files:
file_type = get_file_type(file.content_type) file_type, encoding = get_file_type(file.content_type)
dict_to_update = None dict_to_update = None
if file_type == "org": if file_type == "org":
dict_to_update = org_files dict_to_update = org_files
@@ -85,7 +85,9 @@ async def index_batch(
dict_to_update = plaintext_files dict_to_update = plaintext_files
if dict_to_update is not None: if dict_to_update is not None:
dict_to_update[file.filename] = file.file.read().decode("utf-8") dict_to_update[file.filename] = (
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
)
else: else:
logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}") logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}")

View File

@@ -66,24 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
return merged_dict return merged_dict
def get_file_type(file_type: str) -> tuple[str, str]:
    """Get file type and encoding from a file's mime type.

    The mime type is expected in Content-Type header form, e.g.
    "text/markdown; charset=utf-8". The media type is matched
    case-insensitively and only the "charset" parameter is treated as the
    encoding; any other parameters are ignored.

    Args:
        file_type: Mime type of the file, optionally followed by
            ";"-separated "key=value" parameters.

    Returns:
        A (file type, encoding) pair. File type is one of "markdown", "org",
        "pdf", "jpeg", "png", "plaintext" or "other". Encoding is the
        lowercased charset value with surrounding quotes removed, or None
        when no non-empty charset parameter is present.
    """
    # Split "type/subtype; key=value; ..." into the media type and its parameters
    media_type, *parameters = file_type.split(";")
    media_type = media_type.strip().lower()

    encoding = None
    for parameter in parameters:
        # partition never raises, unlike indexing the result of split("=")
        key, separator, value = parameter.partition("=")
        if separator and key.strip().lower() == "charset":
            # Charset values may be quoted, e.g. charset="UTF-8"
            encoding = value.strip().strip("\"'").lower() or None
            break

    if media_type in ["text/markdown"]:
        return "markdown", encoding
    elif media_type in ["text/org"]:
        return "org", encoding
    elif media_type in ["application/pdf"]:
        return "pdf", encoding
    elif media_type in ["image/jpeg"]:
        return "jpeg", encoding
    elif media_type in ["image/png"]:
        return "png", encoding
    elif media_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
        return "plaintext", encoding
    else:
        return "other", encoding
def load_model( def load_model(