Improve, simplify image generation prompts and context flow

Use a seed to stabilize image-generation consistency across turns when
- KHOJ_LLM_SEED env var is set
- Using Image models via Replicate
  (OpenAI and Google image models do not support a seed parameter)
This commit is contained in:
Debanjum
2025-08-25 00:27:17 -07:00
parent 0fb6020f30
commit 5a2cae3756
5 changed files with 59 additions and 76 deletions

View File

@@ -23,7 +23,6 @@ Today is {day_of_week}, {current_date} in UTC.
- display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\]
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
For example: "The weather today is sunny [1](https://weather.com)."
- Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response.
- Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code.
""".strip()
)
@@ -46,7 +45,6 @@ Today is {day_of_week}, {current_date} in UTC.
- display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]`
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
For example: "The weather today is sunny [1](https://weather.com)."
- Mention generated assets like images by reference, e.g ![chart](/visualization/image.png). Do not manually output raw, b64 encoded bytes in your response.
# Instructions:\n{bio}
""".strip()
@@ -115,45 +113,38 @@ User's Notes:
## Image Generation
## --
image_generation_improve_prompt_base = """
enhance_image_system_message = PromptTemplate.from_template(
"""
You are a talented media artist with the ability to describe images to compose in professional, fine detail.
Your image description will be transformed into an image by an AI model on your team.
{personality_context}
Generate a vivid description of the image to be rendered using the provided context and user prompt below:
Today's Date: {current_date}
User's Location: {location}
# Instructions
- Retain important information and follow instructions by the user when composing the image description.
- Weave in the context provided below if it will enhance the image.
- Specify desired elements, lighting, mood, and composition.
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
- Transform any negations in user instructions into positive alternatives.
Instead of saying what should NOT be in the image, describe what SHOULD be there instead.
Examples:
- "no sun""overcast cloudy sky"
- "don't include people""empty landscape" or "solitary scene"
- Ensure your image description is in prose format (e.g no lists, links).
- If any text is to be rendered in the image put it within double quotes in your image description.
User's Notes:
# Context
## User Location: {location}
## User Documents
{references}
Online References:
## Online References
{online_results}
Conversation Log:
{chat_history}
Now generate a vivid description of the image to be rendered.
User Prompt: "{query}"
Now generate an professional description of the image to generate in vivid, fine detail.
- Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation.
- Retain any important information and follow any instructions in the conversation log or user prompt.
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
- Ensure your improved prompt is in prose format."""
image_generation_improve_prompt_dalle = PromptTemplate.from_template(
f"""
{image_generation_improve_prompt_base}
Improved Prompt:
""".strip()
)
image_generation_improve_prompt_sd = PromptTemplate.from_template(
f"""
{image_generation_improve_prompt_base}
- If any text is to be rendered in the image put it within double quotes in your improved prompt.
Improved Prompt:
Image Description:
""".strip()
)

View File

@@ -645,9 +645,10 @@ def generate_chatml_messages_with_context(
reconstructed_context_message = ChatMessage(content=message_context, role="user")
chatml_messages.insert(0, reconstructed_context_message)
# Add generated assets
if not is_none_or_empty(chat.images) and role == "assistant":
generated_assets["image"] = {
"query": (chat.intent.inferred_queries or [user_message])[0],
"description": (chat.intent.inferred_queries or [user_message])[0],
}
if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant":

View File

@@ -1,6 +1,7 @@
import base64
import io
import logging
import os
import time
from typing import Any, Callable, Dict, List, Optional
@@ -21,6 +22,7 @@ from khoj.database.adapters import ConversationAdapters
from khoj.database.models import (
Agent,
ChatMessageModel,
Intent,
KhojUser,
TextToImageModelConfig,
)
@@ -60,14 +62,17 @@ async def text_to_image(
return
text2image_model = text_to_image_config.model_name
chat_history_str = ""
image_chat_history: List[ChatMessageModel] = []
default_intent = Intent(type="remember")
for chat in chat_history[-4:]:
if chat.by == "you":
chat_history_str += f"Q: {chat.message}\n"
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
elif chat.by == "khoj" and chat.images and chat.intent and chat.intent.inferred_queries:
image_chat_history += [
ChatMessageModel(by=chat.by, message=chat.intent.inferred_queries[0], intent=default_intent)
]
elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
chat_history_str += f"A: {chat.message}\n"
elif chat.by == "khoj" and chat.images:
chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n"
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
if send_status_func:
async for event in send_status_func("**Enhancing the Painting Prompt**"):
@@ -77,7 +82,7 @@ async def text_to_image(
# Use the user's message, chat history, and other context
image_prompt = await generate_better_image_prompt(
message,
chat_history_str,
image_chat_history,
location_data=location_data,
note_references=references,
online_results=online_results,
@@ -241,6 +246,11 @@ def generate_image_with_replicate(
"output_quality": 100,
}
}
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
if seed:
json["input"]["seed"] = seed
create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json()
# Get status of image generation task

View File

@@ -1300,7 +1300,7 @@ async def event_generator(
generated_images.append(generated_image)
generated_asset_results["images"] = {
"query": improved_image_prompt,
"description": improved_image_prompt,
}
async for result in send_event(

View File

@@ -1066,7 +1066,7 @@ async def generate_mermaidjs_diagram_from_description(
async def generate_better_image_prompt(
q: str,
conversation_history: str,
conversation_history: List[ChatMessageModel],
location_data: LocationData,
note_references: List[Dict[str, Any]],
online_results: Optional[dict] = None,
@@ -1081,7 +1081,6 @@ async def generate_better_image_prompt(
Generate a better image prompt from the given query
"""
today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A")
personality_context = (
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
)
@@ -1089,51 +1088,33 @@ async def generate_better_image_prompt(
location = f"{location_data}" if location_data else "Unknown"
user_references = "\n\n".join([f"# {item['compiled']}" for item in note_references])
user_references = "\n\n".join([f"- text:\n{item['compiled']}" for item in note_references])
simplified_online_results = {}
for result in online_results or []:
if online_results[result].get("answerBox"):
simplified_online_results[result] = online_results[result]["answerBox"]
elif online_results[result].get("webpages"):
simplified_online_results[result] = online_results[result]["webpages"]
if online_results:
for result in online_results:
if online_results[result].get("answerBox"):
simplified_online_results[result] = online_results[result]["answerBox"]
elif online_results[result].get("webpages"):
simplified_online_results[result] = online_results[result]["webpages"]
if model_type == TextToImageModelConfig.ModelType.OPENAI:
image_prompt = prompts.image_generation_improve_prompt_dalle.format(
query=q,
chat_history=conversation_history,
location=location,
current_date=today_date,
references=user_references,
online_results=simplified_online_results,
personality_context=personality_context,
)
elif model_type in [
TextToImageModelConfig.ModelType.STABILITYAI,
TextToImageModelConfig.ModelType.REPLICATE,
TextToImageModelConfig.ModelType.GOOGLE,
]:
image_prompt = prompts.image_generation_improve_prompt_sd.format(
query=q,
chat_history=conversation_history,
location=location,
current_date=today_date,
references=user_references,
online_results=simplified_online_results,
personality_context=personality_context,
)
enhance_image_system_message = prompts.enhance_image_system_message.format(
location=location,
references=user_references,
online_results=simplified_online_results or "",
personality_context=personality_context,
)
agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None
with timer("Chat actor: Generate contextual image prompt", logger):
response = await send_message_to_model_wrapper(
image_prompt,
q,
system_message=enhance_image_system_message,
query_images=query_images,
user=user,
query_files=query_files,
chat_history=conversation_history,
agent_chat_model=agent_chat_model,
user=user,
tracer=tracer,
)
response_text = response.text.strip()