From 4892e73323f1275a95e3e358b6dfa674600e621e Mon Sep 17 00:00:00 2001 From: Debanjum Date: Tue, 3 Jun 2025 20:44:26 -0700 Subject: [PATCH] Remove unsupported NUL char from file, chat before save to DB --- src/khoj/database/adapters/__init__.py | 22 +++++++++++++++------- src/khoj/routers/api_chat.py | 3 ++- src/khoj/utils/helpers.py | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index ac121ff9..3080d0f0 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -72,6 +72,8 @@ from khoj.search_filter.word_filter import WordFilter from khoj.utils import state from khoj.utils.config import OfflineChatProcessorModel from khoj.utils.helpers import ( + clean_object_for_db, + clean_text_for_db, generate_random_internal_agent_name, generate_random_name, in_debug_mode, @@ -1032,7 +1034,7 @@ class ConversationAdapters: user=user, client=client_application, id=conversation_id ).afirst() if conversation: - conversation.title = title + conversation.title = clean_text_for_db(title) await conversation.asave() return conversation return None @@ -1432,14 +1434,15 @@ class ConversationAdapters: await Conversation.objects.filter(user=user, client=client_application).order_by("-updated_at").afirst() ) + cleaned_conversation_log = clean_object_for_db(conversation_log) if conversation: - conversation.conversation_log = conversation_log + conversation.conversation_log = cleaned_conversation_log conversation.slug = slug conversation.updated_at = django_timezone.now() await conversation.asave() else: await Conversation.objects.acreate( - user=user, conversation_log=conversation_log, client=client_application, slug=slug + user=user, conversation_log=cleaned_conversation_log, client=client_application, slug=slug ) @staticmethod @@ -1610,6 +1613,7 @@ class ConversationAdapters: conversation_log = conversation.conversation_log 
updated_log = [msg for msg in conversation_log["chat"] if msg.get("turnId") != turn_id] conversation.conversation_log["chat"] = updated_log + conversation.conversation_log = clean_object_for_db(conversation.conversation_log) conversation.save() return True @@ -1617,13 +1621,15 @@ class ConversationAdapters: class FileObjectAdapters: @staticmethod def update_raw_text(file_object: FileObject, new_raw_text: str): - file_object.raw_text = new_raw_text + cleaned_raw_text = clean_text_for_db(new_raw_text) + file_object.raw_text = cleaned_raw_text file_object.save() @staticmethod @require_valid_user def create_file_object(user: KhojUser, file_name: str, raw_text: str): - return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text) + cleaned_raw_text = clean_text_for_db(raw_text) + return FileObject.objects.create(user=user, file_name=file_name, raw_text=cleaned_raw_text) @staticmethod @require_valid_user @@ -1647,13 +1653,15 @@ class FileObjectAdapters: @staticmethod async def aupdate_raw_text(file_object: FileObject, new_raw_text: str): - file_object.raw_text = new_raw_text + cleaned_raw_text = clean_text_for_db(new_raw_text) + file_object.raw_text = cleaned_raw_text await file_object.asave() @staticmethod @arequire_valid_user async def acreate_file_object(user: KhojUser, file_name: str, raw_text: str): - return await FileObject.objects.acreate(user=user, file_name=file_name, raw_text=raw_text) + cleaned_raw_text = clean_text_for_db(raw_text) + return await FileObject.objects.acreate(user=user, file_name=file_name, raw_text=cleaned_raw_text) @staticmethod @arequire_valid_user diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index a7bb513f..cf3d1207 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -71,6 +71,7 @@ from khoj.routers.storage import upload_user_image_to_bucket from khoj.utils import state from khoj.utils.helpers import ( ConversationCommand, + clean_text_for_db, command_descriptions, 
convert_image_to_webp, get_country_code_from_timezone, @@ -631,7 +632,7 @@ async def generate_chat_title( raise HTTPException(status_code=404, detail="Conversation not found") new_title = await acreate_title_from_history(request.user.object, conversation=conversation) - conversation.slug = new_title[:200] + conversation.slug = clean_text_for_db(new_title[:200]) await conversation.asave() diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 4e877c11..a01ca3a9 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -833,3 +833,26 @@ def normalize_email(email: str, check_deliverability=False) -> tuple[str, bool]: return valid_email.normalized, True except (EmailNotValidError, EmailUndeliverableError): return lower_email, False + + +def clean_text_for_db(text): + """Remove characters that PostgreSQL DB cannot store in text fields. + + PostgreSQL text fields cannot contain NUL (0x00) characters. + This is a database-level constraint. + """ + if not isinstance(text, str): + return text + return text.replace("\x00", "") + + +def clean_object_for_db(data): + """Recursively clean PostgreSQL-incompatible characters from nested data structures.""" + if isinstance(data, str): + return clean_text_for_db(data) + elif isinstance(data, dict): + return {k: clean_object_for_db(v) for k, v in data.items()} + elif isinstance(data, list): + return [clean_object_for_db(item) for item in data] + else: + return data