diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py index 52a492af..5e7d67fa 100644 --- a/src/khoj/processor/conversation/prompts.py +++ b/src/khoj/processor/conversation/prompts.py @@ -23,7 +23,6 @@ Today is {day_of_week}, {current_date} in UTC. - display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\] - Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim. For example: "The weather today is sunny [1](https://weather.com)." -- Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response. - Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code. """.strip() ) @@ -46,7 +45,6 @@ Today is {day_of_week}, {current_date} in UTC. - display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]` - Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim. For example: "The weather today is sunny [1](https://weather.com)." -- Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response. # Instructions:\n{bio} """.strip() @@ -115,45 +113,38 @@ User's Notes: ## Image Generation ## -- -image_generation_improve_prompt_base = """ +enhance_image_system_message = PromptTemplate.from_template( + """ You are a talented media artist with the ability to describe images to compose in professional, fine detail. +Your image description will be transformed into an image by an AI model on your team. 
{personality_context} -Generate a vivid description of the image to be rendered using the provided context and user prompt below: -Today's Date: {current_date} -User's Location: {location} +# Instructions +- Retain important information and follow instructions by the user when composing the image description. +- Weave in the context provided below if it will enhance the image. +- Specify desired elements, lighting, mood, and composition. +- Add specific, fine position details. Mention painting style, camera parameters to compose the image. +- Transform any negations in user instructions into positive alternatives. + Instead of saying what should NOT be in the image, describe what SHOULD be there instead. + Examples: + - "no sun" → "overcast cloudy sky" + - "don't include people" → "empty landscape" or "solitary scene" +- Ensure your image description is in prose format (e.g. no lists, links). +- If any text is to be rendered in the image put it within double quotes in your image description. -User's Notes: +# Context + +## User Location: {location} + +## User Documents {references} -Online References: +## Online References {online_results} -Conversation Log: -{chat_history} +Now generate a vivid description of the image to be rendered. -User Prompt: "{query}" - -Now generate an professional description of the image to generate in vivid, fine detail. -- Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation. -- Retain any important information and follow any instructions in the conversation log or user prompt. -- Add specific, fine position details. Mention painting style, camera parameters to compose the image. 
-- Ensure your improved prompt is in prose format.""" - -image_generation_improve_prompt_dalle = PromptTemplate.from_template( - f""" -{image_generation_improve_prompt_base} - -Improved Prompt: -""".strip() -) - -image_generation_improve_prompt_sd = PromptTemplate.from_template( - f""" -{image_generation_improve_prompt_base} -- If any text is to be rendered in the image put it within double quotes in your improved prompt. - -Improved Prompt: +Image Description: """.strip() ) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index ca22aea0..dcd66a2e 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -645,9 +645,10 @@ def generate_chatml_messages_with_context( reconstructed_context_message = ChatMessage(content=message_context, role="user") chatml_messages.insert(0, reconstructed_context_message) + # Add generated assets if not is_none_or_empty(chat.images) and role == "assistant": generated_assets["image"] = { - "query": (chat.intent.inferred_queries or [user_message])[0], + "description": (chat.intent.inferred_queries or [user_message])[0], } if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant": diff --git a/src/khoj/processor/image/generate.py b/src/khoj/processor/image/generate.py index fbe4d4f0..f62e751c 100644 --- a/src/khoj/processor/image/generate.py +++ b/src/khoj/processor/image/generate.py @@ -1,6 +1,7 @@ import base64 import io import logging +import os import time from typing import Any, Callable, Dict, List, Optional @@ -21,6 +22,7 @@ from khoj.database.adapters import ConversationAdapters from khoj.database.models import ( Agent, ChatMessageModel, + Intent, KhojUser, TextToImageModelConfig, ) @@ -60,14 +62,17 @@ async def text_to_image( return text2image_model = text_to_image_config.model_name - chat_history_str = "" + image_chat_history: List[ChatMessageModel] = [] + default_intent = Intent(type="remember") for chat in 
chat_history[-4:]: if chat.by == "you": - chat_history_str += f"Q: {chat.message}\n" + image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)] + elif chat.by == "khoj" and chat.images and chat.intent and chat.intent.inferred_queries: + image_chat_history += [ + ChatMessageModel(by=chat.by, message=chat.intent.inferred_queries[0], intent=default_intent) + ] elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]: - chat_history_str += f"A: {chat.message}\n" - elif chat.by == "khoj" and chat.images: - chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n" + image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)] if send_status_func: async for event in send_status_func("**Enhancing the Painting Prompt**"): @@ -77,7 +82,7 @@ async def text_to_image( # Use the user's message, chat history, and other context image_prompt = await generate_better_image_prompt( message, - chat_history_str, + image_chat_history, location_data=location_data, note_references=references, online_results=online_results, @@ -241,6 +246,11 @@ def generate_image_with_replicate( "output_quality": 100, } } + + seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None + if seed: + json["input"]["seed"] = seed + create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json() # Get status of image generation task diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 88035e9d..31c6c9ed 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -1300,7 +1300,7 @@ async def event_generator( generated_images.append(generated_image) generated_asset_results["images"] = { - "query": improved_image_prompt, + "description": improved_image_prompt, } async for result in send_event( diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 
17512121..e9cf748d 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1066,7 +1066,7 @@ async def generate_mermaidjs_diagram_from_description( async def generate_better_image_prompt( q: str, - conversation_history: str, + conversation_history: List[ChatMessageModel], location_data: LocationData, note_references: List[Dict[str, Any]], online_results: Optional[dict] = None, @@ -1081,7 +1081,6 @@ async def generate_better_image_prompt( Generate a better image prompt from the given query """ - today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A") personality_context = ( prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else "" ) @@ -1089,51 +1088,33 @@ async def generate_better_image_prompt( location = f"{location_data}" if location_data else "Unknown" - user_references = "\n\n".join([f"# {item['compiled']}" for item in note_references]) + user_references = "\n\n".join([f"- text:\n{item['compiled']}" for item in note_references]) simplified_online_results = {} + for result in online_results or []: + if online_results[result].get("answerBox"): + simplified_online_results[result] = online_results[result]["answerBox"] + elif online_results[result].get("webpages"): + simplified_online_results[result] = online_results[result]["webpages"] - if online_results: - for result in online_results: - if online_results[result].get("answerBox"): - simplified_online_results[result] = online_results[result]["answerBox"] - elif online_results[result].get("webpages"): - simplified_online_results[result] = online_results[result]["webpages"] - - if model_type == TextToImageModelConfig.ModelType.OPENAI: - image_prompt = prompts.image_generation_improve_prompt_dalle.format( - query=q, - chat_history=conversation_history, - location=location, - current_date=today_date, - references=user_references, - online_results=simplified_online_results, - personality_context=personality_context, - ) - elif model_type 
in [ - TextToImageModelConfig.ModelType.STABILITYAI, - TextToImageModelConfig.ModelType.REPLICATE, - TextToImageModelConfig.ModelType.GOOGLE, - ]: - image_prompt = prompts.image_generation_improve_prompt_sd.format( - query=q, - chat_history=conversation_history, - location=location, - current_date=today_date, - references=user_references, - online_results=simplified_online_results, - personality_context=personality_context, - ) + enhance_image_system_message = prompts.enhance_image_system_message.format( + location=location, + references=user_references, + online_results=simplified_online_results or "", + personality_context=personality_context, + ) agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None with timer("Chat actor: Generate contextual image prompt", logger): response = await send_message_to_model_wrapper( - image_prompt, + q, + system_message=enhance_image_system_message, query_images=query_images, - user=user, query_files=query_files, + chat_history=conversation_history, agent_chat_model=agent_chat_model, + user=user, tracer=tracer, ) response_text = response.text.strip()