Improve, simplify image generation prompts and context flow

Use a seed to stabilize image-generation consistency across turns when
- KHOJ_LLM_SEED env var is set
- Using Image models via Replicate
  (OpenAI and Google image models do not support a seed parameter)
This commit is contained in:
Debanjum
2025-08-25 00:27:17 -07:00
parent 0fb6020f30
commit 5a2cae3756
5 changed files with 59 additions and 76 deletions

View File

@@ -23,7 +23,6 @@ Today is {day_of_week}, {current_date} in UTC.
- display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\]
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
For example: "The weather today is sunny [1](https://weather.com)."
- Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response.
- Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code.
""".strip()
)
@@ -46,7 +45,6 @@ Today is {day_of_week}, {current_date} in UTC.
- display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]`
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
For example: "The weather today is sunny [1](https://weather.com)."
- Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response.
# Instructions:\n{bio}
""".strip()
@@ -115,45 +113,38 @@ User's Notes:
## Image Generation
## --
image_generation_improve_prompt_base = """
enhance_image_system_message = PromptTemplate.from_template(
"""
You are a talented media artist with the ability to describe images to compose in professional, fine detail.
Your image description will be transformed into an image by an AI model on your team.
{personality_context}
Generate a vivid description of the image to be rendered using the provided context and user prompt below:
Today's Date: {current_date}
User's Location: {location}
# Instructions
- Retain important information and follow instructions by the user when composing the image description.
- Weave in the context provided below if it will enhance the image.
- Specify desired elements, lighting, mood, and composition.
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
- Transform any negations in user instructions into positive alternatives.
Instead of saying what should NOT be in the image, describe what SHOULD be there instead.
Examples:
- "no sun""overcast cloudy sky"
- "don't include people""empty landscape" or "solitary scene"
- Ensure your image description is in prose format (e.g no lists, links).
- If any text is to be rendered in the image put it within double quotes in your image description.
User's Notes:
# Context
## User Location: {location}
## User Documents
{references}
Online References:
## Online References
{online_results}
Conversation Log:
{chat_history}
Now generate a vivid description of the image to be rendered.
User Prompt: "{query}"
Now generate an professional description of the image to generate in vivid, fine detail.
- Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation.
- Retain any important information and follow any instructions in the conversation log or user prompt.
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
- Ensure your improved prompt is in prose format."""
image_generation_improve_prompt_dalle = PromptTemplate.from_template(
f"""
{image_generation_improve_prompt_base}
Improved Prompt:
""".strip()
)
image_generation_improve_prompt_sd = PromptTemplate.from_template(
f"""
{image_generation_improve_prompt_base}
- If any text is to be rendered in the image put it within double quotes in your improved prompt.
Improved Prompt:
Image Description:
""".strip()
)

View File

@@ -645,9 +645,10 @@ def generate_chatml_messages_with_context(
reconstructed_context_message = ChatMessage(content=message_context, role="user")
chatml_messages.insert(0, reconstructed_context_message)
# Add generated assets
if not is_none_or_empty(chat.images) and role == "assistant":
generated_assets["image"] = {
"query": (chat.intent.inferred_queries or [user_message])[0],
"description": (chat.intent.inferred_queries or [user_message])[0],
}
if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant":

View File

@@ -1,6 +1,7 @@
import base64
import io
import logging
import os
import time
from typing import Any, Callable, Dict, List, Optional
@@ -21,6 +22,7 @@ from khoj.database.adapters import ConversationAdapters
from khoj.database.models import (
Agent,
ChatMessageModel,
Intent,
KhojUser,
TextToImageModelConfig,
)
@@ -60,14 +62,17 @@ async def text_to_image(
return
text2image_model = text_to_image_config.model_name
chat_history_str = ""
image_chat_history: List[ChatMessageModel] = []
default_intent = Intent(type="remember")
for chat in chat_history[-4:]:
if chat.by == "you":
chat_history_str += f"Q: {chat.message}\n"
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
elif chat.by == "khoj" and chat.images and chat.intent and chat.intent.inferred_queries:
image_chat_history += [
ChatMessageModel(by=chat.by, message=chat.intent.inferred_queries[0], intent=default_intent)
]
elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
chat_history_str += f"A: {chat.message}\n"
elif chat.by == "khoj" and chat.images:
chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n"
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
if send_status_func:
async for event in send_status_func("**Enhancing the Painting Prompt**"):
@@ -77,7 +82,7 @@ async def text_to_image(
# Use the user's message, chat history, and other context
image_prompt = await generate_better_image_prompt(
message,
chat_history_str,
image_chat_history,
location_data=location_data,
note_references=references,
online_results=online_results,
@@ -241,6 +246,11 @@ def generate_image_with_replicate(
"output_quality": 100,
}
}
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
if seed:
json["input"]["seed"] = seed
create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json()
# Get status of image generation task

View File

@@ -1300,7 +1300,7 @@ async def event_generator(
generated_images.append(generated_image)
generated_asset_results["images"] = {
"query": improved_image_prompt,
"description": improved_image_prompt,
}
async for result in send_event(

View File

@@ -1066,7 +1066,7 @@ async def generate_mermaidjs_diagram_from_description(
async def generate_better_image_prompt(
q: str,
conversation_history: str,
conversation_history: List[ChatMessageModel],
location_data: LocationData,
note_references: List[Dict[str, Any]],
online_results: Optional[dict] = None,
@@ -1081,7 +1081,6 @@ async def generate_better_image_prompt(
Generate a better image prompt from the given query
"""
today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A")
personality_context = (
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
)
@@ -1089,51 +1088,33 @@ async def generate_better_image_prompt(
location = f"{location_data}" if location_data else "Unknown"
user_references = "\n\n".join([f"# {item['compiled']}" for item in note_references])
user_references = "\n\n".join([f"- text:\n{item['compiled']}" for item in note_references])
simplified_online_results = {}
for result in online_results or []:
if online_results[result].get("answerBox"):
simplified_online_results[result] = online_results[result]["answerBox"]
elif online_results[result].get("webpages"):
simplified_online_results[result] = online_results[result]["webpages"]
if online_results:
for result in online_results:
if online_results[result].get("answerBox"):
simplified_online_results[result] = online_results[result]["answerBox"]
elif online_results[result].get("webpages"):
simplified_online_results[result] = online_results[result]["webpages"]
if model_type == TextToImageModelConfig.ModelType.OPENAI:
image_prompt = prompts.image_generation_improve_prompt_dalle.format(
query=q,
chat_history=conversation_history,
location=location,
current_date=today_date,
references=user_references,
online_results=simplified_online_results,
personality_context=personality_context,
)
elif model_type in [
TextToImageModelConfig.ModelType.STABILITYAI,
TextToImageModelConfig.ModelType.REPLICATE,
TextToImageModelConfig.ModelType.GOOGLE,
]:
image_prompt = prompts.image_generation_improve_prompt_sd.format(
query=q,
chat_history=conversation_history,
location=location,
current_date=today_date,
references=user_references,
online_results=simplified_online_results,
personality_context=personality_context,
)
enhance_image_system_message = prompts.enhance_image_system_message.format(
location=location,
references=user_references,
online_results=simplified_online_results or "",
personality_context=personality_context,
)
agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None
with timer("Chat actor: Generate contextual image prompt", logger):
response = await send_message_to_model_wrapper(
image_prompt,
q,
system_message=enhance_image_system_message,
query_images=query_images,
user=user,
query_files=query_files,
chat_history=conversation_history,
agent_chat_model=agent_chat_model,
user=user,
tracer=tracer,
)
response_text = response.text.strip()