Remove unsupported NUL char from file, chat text before saving to DB

This commit is contained in:
Debanjum
2025-06-03 20:44:26 -07:00
parent 27534f6533
commit 4892e73323
3 changed files with 40 additions and 8 deletions

View File

@@ -72,6 +72,8 @@ from khoj.search_filter.word_filter import WordFilter
from khoj.utils import state
from khoj.utils.config import OfflineChatProcessorModel
from khoj.utils.helpers import (
clean_object_for_db,
clean_text_for_db,
generate_random_internal_agent_name,
generate_random_name,
in_debug_mode,
@@ -1032,7 +1034,7 @@ class ConversationAdapters:
user=user, client=client_application, id=conversation_id
).afirst()
if conversation:
conversation.title = title
conversation.title = clean_text_for_db(title)
await conversation.asave()
return conversation
return None
@@ -1432,14 +1434,15 @@ class ConversationAdapters:
await Conversation.objects.filter(user=user, client=client_application).order_by("-updated_at").afirst()
)
cleaned_conversation_log = clean_object_for_db(conversation_log)
if conversation:
conversation.conversation_log = conversation_log
conversation.conversation_log = cleaned_conversation_log
conversation.slug = slug
conversation.updated_at = django_timezone.now()
await conversation.asave()
else:
await Conversation.objects.acreate(
user=user, conversation_log=conversation_log, client=client_application, slug=slug
user=user, conversation_log=cleaned_conversation_log, client=client_application, slug=slug
)
@staticmethod
@@ -1610,6 +1613,7 @@ class ConversationAdapters:
conversation_log = conversation.conversation_log
updated_log = [msg for msg in conversation_log["chat"] if msg.get("turnId") != turn_id]
conversation.conversation_log["chat"] = updated_log
conversation.conversation_log = clean_object_for_db(conversation.conversation_log)
conversation.save()
return True
@@ -1617,13 +1621,15 @@ class ConversationAdapters:
class FileObjectAdapters:
@staticmethod
def update_raw_text(file_object: FileObject, new_raw_text: str):
file_object.raw_text = new_raw_text
cleaned_raw_text = clean_text_for_db(new_raw_text)
file_object.raw_text = cleaned_raw_text
file_object.save()
@staticmethod
@require_valid_user
def create_file_object(user: KhojUser, file_name: str, raw_text: str):
return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text)
cleaned_raw_text = clean_text_for_db(raw_text)
return FileObject.objects.create(user=user, file_name=file_name, raw_text=cleaned_raw_text)
@staticmethod
@require_valid_user
@@ -1647,13 +1653,15 @@ class FileObjectAdapters:
@staticmethod
async def aupdate_raw_text(file_object: FileObject, new_raw_text: str):
file_object.raw_text = new_raw_text
cleaned_raw_text = clean_text_for_db(new_raw_text)
file_object.raw_text = cleaned_raw_text
await file_object.asave()
@staticmethod
@arequire_valid_user
async def acreate_file_object(user: KhojUser, file_name: str, raw_text: str):
return await FileObject.objects.acreate(user=user, file_name=file_name, raw_text=raw_text)
cleaned_raw_text = clean_text_for_db(raw_text)
return await FileObject.objects.acreate(user=user, file_name=file_name, raw_text=cleaned_raw_text)
@staticmethod
@arequire_valid_user

View File

@@ -71,6 +71,7 @@ from khoj.routers.storage import upload_user_image_to_bucket
from khoj.utils import state
from khoj.utils.helpers import (
ConversationCommand,
clean_text_for_db,
command_descriptions,
convert_image_to_webp,
get_country_code_from_timezone,
@@ -631,7 +632,7 @@ async def generate_chat_title(
raise HTTPException(status_code=404, detail="Conversation not found")
new_title = await acreate_title_from_history(request.user.object, conversation=conversation)
conversation.slug = new_title[:200]
conversation.slug = clean_text_for_db(new_title[:200])
await conversation.asave()

View File

@@ -833,3 +833,26 @@ def normalize_email(email: str, check_deliverability=False) -> tuple[str, bool]:
return valid_email.normalized, True
except (EmailNotValidError, EmailUndeliverableError):
return lower_email, False
def clean_text_for_db(text):
    """Strip NUL characters from a string so PostgreSQL can store it.

    PostgreSQL rejects text values containing NUL (0x00); this is enforced
    by the database itself, so the character is dropped before saving.

    Args:
        text: The value about to be written to a text field.

    Returns:
        The string with every NUL character removed. Anything that is not
        a ``str`` (``None``, numbers, bytes, ...) is handed back untouched.
    """
    return text.replace("\x00", "") if isinstance(text, str) else text
def clean_object_for_db(data):
    """Recursively clean PostgreSQL-incompatible characters from nested data.

    Walks dicts, lists and tuples and passes every string found through
    ``clean_text_for_db``. Containers are rebuilt instead of mutated, so the
    object passed in is left untouched.

    Dictionary keys are cleaned as well as values: PostgreSQL stores JSON
    object keys as text, so a NUL in a key is rejected just like a NUL in a
    value. If two keys differ only by NUL characters they collapse into a
    single entry and the later item wins.

    Args:
        data: A string, dict, list, tuple or any other value, nested to any
            depth.

    Returns:
        A cleaned copy of ``data``. Lists come back as ``list`` and tuples as
        ``tuple`` (subclasses are not preserved). Values of any other type
        are returned unchanged.
    """
    if isinstance(data, str):
        return clean_text_for_db(data)
    if isinstance(data, dict):
        # clean_text_for_db passes non-string keys (ints, None, ...) through as-is.
        return {clean_text_for_db(key): clean_object_for_db(value) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        # Tuples serialize to JSON arrays too, so their strings need cleaning as well.
        cleaned_items = [clean_object_for_db(item) for item in data]
        return cleaned_items if isinstance(data, list) else tuple(cleaned_items)
    return data