Improve, simplify image generation prompts and context flow

Use seed to stabilize image change consistency across turns when:
- the KHOJ_LLM_SEED env var is set
- using image models via Replicate (OpenAI and Google do not support an image seed)
2026-03-09 21:29:11 +00:00 · 2025-08-25 00:27:17 -07:00
parent 0fb6020f30
commit 5a2cae3756
5 changed files with 59 additions and 76 deletions
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -23,7 +23,6 @@ Today is {day_of_week}, {current_date} in UTC.
    - display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\]
 - Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
  For example: "The weather today is sunny [1](https://weather.com)."
 - Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response.
 - Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code.
 """.strip()
 )
@@ -46,7 +45,6 @@ Today is {day_of_week}, {current_date} in UTC.
    - display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]`
 - Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
  For example: "The weather today is sunny [1](https://weather.com)."
 - Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response.
 # Instructions:\n{bio}
 """.strip()
@@ -115,45 +113,38 @@ User's Notes:
 ## Image Generation
 ## --
-image_generation_improve_prompt_base = """
+enhance_image_system_message = PromptTemplate.from_template(
    """
 You are a talented media artist with the ability to describe images to compose in professional, fine detail.
 Your image description will be transformed into an image by an AI model on your team.
 {personality_context}
 Generate a vivid description of the image to be rendered using the provided context and user prompt below:
-Today's Date: {current_date}
+# Instructions
-User's Location: {location}
+- Retain important information and follow instructions by the user when composing the image description.
 - Weave in the context provided below if it will enhance the image.
 - Specify desired elements, lighting, mood, and composition.
 - Add specific, fine position details. Mention painting style, camera parameters to compose the image.
 - Transform any negations in user instructions into positive alternatives.
  Instead of saying what should NOT be in the image, describe what SHOULD be there instead.
  Examples:
  - "no sun" → "overcast cloudy sky"
  - "don't include people" → "empty landscape" or "solitary scene"
 - Ensure your image description is in prose format (e.g no lists, links).
 - If any text is to be rendered in the image put it within double quotes in your image description.
-User's Notes:
+# Context
 ## User Location: {location}
 ## User Documents
 {references}
-Online References:
+## Online References
 {online_results}
-Conversation Log:
+Now generate a vivid description of the image to be rendered.
 {chat_history}
-User Prompt: "{query}"
+Image Description:
 Now generate an professional description of the image to generate in vivid, fine detail.
 - Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation.
 - Retain any important information and follow any instructions in the conversation log or user prompt.
 - Add specific, fine position details. Mention painting style, camera parameters to compose the image.
 - Ensure your improved prompt is in prose format."""
 image_generation_improve_prompt_dalle = PromptTemplate.from_template(
    f"""
 {image_generation_improve_prompt_base}
 Improved Prompt:
 """.strip()
 )
 image_generation_improve_prompt_sd = PromptTemplate.from_template(
    f"""
 {image_generation_improve_prompt_base}
 - If any text is to be rendered in the image put it within double quotes in your improved prompt.
 Improved Prompt:
 """.strip()
 )
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -645,9 +645,10 @@ def generate_chatml_messages_with_context(
            reconstructed_context_message = ChatMessage(content=message_context, role="user")
            chatml_messages.insert(0, reconstructed_context_message)
        # Add generated assets
        if not is_none_or_empty(chat.images) and role == "assistant":
            generated_assets["image"] = {
-                "query": (chat.intent.inferred_queries or [user_message])[0],
+                "description": (chat.intent.inferred_queries or [user_message])[0],
            }
        if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant":
--- a/src/khoj/processor/image/generate.py
+++ b/src/khoj/processor/image/generate.py
@@ -1,6 +1,7 @@
 import base64
 import io
 import logging
 import os
 import time
 from typing import Any, Callable, Dict, List, Optional
@@ -21,6 +22,7 @@ from khoj.database.adapters import ConversationAdapters
 from khoj.database.models import (
    Agent,
    ChatMessageModel,
    Intent,
    KhojUser,
    TextToImageModelConfig,
 )
@@ -60,14 +62,17 @@ async def text_to_image(
        return
    text2image_model = text_to_image_config.model_name
-    chat_history_str = ""
+    image_chat_history: List[ChatMessageModel] = []
    default_intent = Intent(type="remember")
    for chat in chat_history[-4:]:
        if chat.by == "you":
-            chat_history_str += f"Q: {chat.message}\n"
+            image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
        elif chat.by == "khoj" and chat.images and chat.intent and chat.intent.inferred_queries:
            image_chat_history += [
                ChatMessageModel(by=chat.by, message=chat.intent.inferred_queries[0], intent=default_intent)
            ]
        elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
-            chat_history_str += f"A: {chat.message}\n"
+            image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
        elif chat.by == "khoj" and chat.images:
            chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n"
    if send_status_func:
        async for event in send_status_func("**Enhancing the Painting Prompt**"):
@@ -77,7 +82,7 @@ async def text_to_image(
    # Use the user's message, chat history, and other context
    image_prompt = await generate_better_image_prompt(
        message,
-        chat_history_str,
+        image_chat_history,
        location_data=location_data,
        note_references=references,
        online_results=online_results,
@@ -241,6 +246,11 @@ def generate_image_with_replicate(
            "output_quality": 100,
        }
    }
    seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
    if seed:
        json["input"]["seed"] = seed
    create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json()
    # Get status of image generation task
--- a/src/khoj/routers/api_chat.py
+++ b/src/khoj/routers/api_chat.py
@@ -1300,7 +1300,7 @@ async def event_generator(
            generated_images.append(generated_image)
            generated_asset_results["images"] = {
-                "query": improved_image_prompt,
+                "description": improved_image_prompt,
            }
            async for result in send_event(
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -1066,7 +1066,7 @@ async def generate_mermaidjs_diagram_from_description(
 async def generate_better_image_prompt(
    q: str,
-    conversation_history: str,
+    conversation_history: List[ChatMessageModel],
    location_data: LocationData,
    note_references: List[Dict[str, Any]],
    online_results: Optional[dict] = None,
@@ -1081,7 +1081,6 @@ async def generate_better_image_prompt(
    Generate a better image prompt from the given query
    """
    today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A")
    personality_context = (
        prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
    )
@@ -1089,51 +1088,33 @@ async def generate_better_image_prompt(
    location = f"{location_data}" if location_data else "Unknown"
-    user_references = "\n\n".join([f"# {item['compiled']}" for item in note_references])
+    user_references = "\n\n".join([f"- text:\n{item['compiled']}" for item in note_references])
    simplified_online_results = {}
    for result in online_results or []:
        if online_results[result].get("answerBox"):
            simplified_online_results[result] = online_results[result]["answerBox"]
        elif online_results[result].get("webpages"):
            simplified_online_results[result] = online_results[result]["webpages"]
-    if online_results:
+    enhance_image_system_message = prompts.enhance_image_system_message.format(
-        for result in online_results:
+        location=location,
-            if online_results[result].get("answerBox"):
+        references=user_references,
-                simplified_online_results[result] = online_results[result]["answerBox"]
+        online_results=simplified_online_results or "",
-            elif online_results[result].get("webpages"):
+        personality_context=personality_context,
-                simplified_online_results[result] = online_results[result]["webpages"]
+    )
    if model_type == TextToImageModelConfig.ModelType.OPENAI:
        image_prompt = prompts.image_generation_improve_prompt_dalle.format(
            query=q,
            chat_history=conversation_history,
            location=location,
            current_date=today_date,
            references=user_references,
            online_results=simplified_online_results,
            personality_context=personality_context,
        )
    elif model_type in [
        TextToImageModelConfig.ModelType.STABILITYAI,
        TextToImageModelConfig.ModelType.REPLICATE,
        TextToImageModelConfig.ModelType.GOOGLE,
    ]:
        image_prompt = prompts.image_generation_improve_prompt_sd.format(
            query=q,
            chat_history=conversation_history,
            location=location,
            current_date=today_date,
            references=user_references,
            online_results=simplified_online_results,
            personality_context=personality_context,
        )
    agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None
    with timer("Chat actor: Generate contextual image prompt", logger):
        response = await send_message_to_model_wrapper(
-            image_prompt,
+            q,
            system_message=enhance_image_system_message,
            query_images=query_images,
            user=user,
            query_files=query_files,
            chat_history=conversation_history,
            agent_chat_model=agent_chat_model,
            user=user,
            tracer=tracer,
        )
        response_text = response.text.strip()