mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 21:19:12 +00:00
Improve, simplify image generation prompts and context flow
Use a seed to stabilize image consistency across turns when both of the following hold: (1) the KHOJ_LLM_SEED env var is set, and (2) the image model is served via Replicate. OpenAI and Google image models do not support an image seed.
This commit is contained in:
@@ -23,7 +23,6 @@ Today is {day_of_week}, {current_date} in UTC.
|
||||
- display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\]
|
||||
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
|
||||
For example: "The weather today is sunny [1](https://weather.com)."
|
||||
- Mention generated assets like images by reference, e.g . Do not manually output raw, b64 encoded bytes in your response.
|
||||
- Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code.
|
||||
""".strip()
|
||||
)
|
||||
@@ -46,7 +45,6 @@ Today is {day_of_week}, {current_date} in UTC.
|
||||
- display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]`
|
||||
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
|
||||
For example: "The weather today is sunny [1](https://weather.com)."
|
||||
- Mention generated assets like images by reference, e.g . Do not manually output raw, b64 encoded bytes in your response.
|
||||
|
||||
# Instructions:\n{bio}
|
||||
""".strip()
|
||||
@@ -115,45 +113,38 @@ User's Notes:
|
||||
## Image Generation
|
||||
## --
|
||||
|
||||
image_generation_improve_prompt_base = """
|
||||
enhance_image_system_message = PromptTemplate.from_template(
|
||||
"""
|
||||
You are a talented media artist with the ability to describe images to compose in professional, fine detail.
|
||||
Your image description will be transformed into an image by an AI model on your team.
|
||||
{personality_context}
|
||||
Generate a vivid description of the image to be rendered using the provided context and user prompt below:
|
||||
|
||||
Today's Date: {current_date}
|
||||
User's Location: {location}
|
||||
# Instructions
|
||||
- Retain important information and follow instructions by the user when composing the image description.
|
||||
- Weave in the context provided below if it will enhance the image.
|
||||
- Specify desired elements, lighting, mood, and composition.
|
||||
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
|
||||
- Transform any negations in user instructions into positive alternatives.
|
||||
Instead of saying what should NOT be in the image, describe what SHOULD be there instead.
|
||||
Examples:
|
||||
- "no sun" → "overcast cloudy sky"
|
||||
- "don't include people" → "empty landscape" or "solitary scene"
|
||||
- Ensure your image description is in prose format (e.g no lists, links).
|
||||
- If any text is to be rendered in the image put it within double quotes in your image description.
|
||||
|
||||
User's Notes:
|
||||
# Context
|
||||
|
||||
## User Location: {location}
|
||||
|
||||
## User Documents
|
||||
{references}
|
||||
|
||||
Online References:
|
||||
## Online References
|
||||
{online_results}
|
||||
|
||||
Conversation Log:
|
||||
{chat_history}
|
||||
Now generate a vivid description of the image to be rendered.
|
||||
|
||||
User Prompt: "{query}"
|
||||
|
||||
Now generate an professional description of the image to generate in vivid, fine detail.
|
||||
- Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation.
|
||||
- Retain any important information and follow any instructions in the conversation log or user prompt.
|
||||
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
|
||||
- Ensure your improved prompt is in prose format."""
|
||||
|
||||
image_generation_improve_prompt_dalle = PromptTemplate.from_template(
|
||||
f"""
|
||||
{image_generation_improve_prompt_base}
|
||||
|
||||
Improved Prompt:
|
||||
""".strip()
|
||||
)
|
||||
|
||||
image_generation_improve_prompt_sd = PromptTemplate.from_template(
|
||||
f"""
|
||||
{image_generation_improve_prompt_base}
|
||||
- If any text is to be rendered in the image put it within double quotes in your improved prompt.
|
||||
|
||||
Improved Prompt:
|
||||
Image Description:
|
||||
""".strip()
|
||||
)
|
||||
|
||||
|
||||
@@ -645,9 +645,10 @@ def generate_chatml_messages_with_context(
|
||||
reconstructed_context_message = ChatMessage(content=message_context, role="user")
|
||||
chatml_messages.insert(0, reconstructed_context_message)
|
||||
|
||||
# Add generated assets
|
||||
if not is_none_or_empty(chat.images) and role == "assistant":
|
||||
generated_assets["image"] = {
|
||||
"query": (chat.intent.inferred_queries or [user_message])[0],
|
||||
"description": (chat.intent.inferred_queries or [user_message])[0],
|
||||
}
|
||||
|
||||
if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant":
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
@@ -21,6 +22,7 @@ from khoj.database.adapters import ConversationAdapters
|
||||
from khoj.database.models import (
|
||||
Agent,
|
||||
ChatMessageModel,
|
||||
Intent,
|
||||
KhojUser,
|
||||
TextToImageModelConfig,
|
||||
)
|
||||
@@ -60,14 +62,17 @@ async def text_to_image(
|
||||
return
|
||||
|
||||
text2image_model = text_to_image_config.model_name
|
||||
chat_history_str = ""
|
||||
image_chat_history: List[ChatMessageModel] = []
|
||||
default_intent = Intent(type="remember")
|
||||
for chat in chat_history[-4:]:
|
||||
if chat.by == "you":
|
||||
chat_history_str += f"Q: {chat.message}\n"
|
||||
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
|
||||
elif chat.by == "khoj" and chat.images and chat.intent and chat.intent.inferred_queries:
|
||||
image_chat_history += [
|
||||
ChatMessageModel(by=chat.by, message=chat.intent.inferred_queries[0], intent=default_intent)
|
||||
]
|
||||
elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
|
||||
chat_history_str += f"A: {chat.message}\n"
|
||||
elif chat.by == "khoj" and chat.images:
|
||||
chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n"
|
||||
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
|
||||
|
||||
if send_status_func:
|
||||
async for event in send_status_func("**Enhancing the Painting Prompt**"):
|
||||
@@ -77,7 +82,7 @@ async def text_to_image(
|
||||
# Use the user's message, chat history, and other context
|
||||
image_prompt = await generate_better_image_prompt(
|
||||
message,
|
||||
chat_history_str,
|
||||
image_chat_history,
|
||||
location_data=location_data,
|
||||
note_references=references,
|
||||
online_results=online_results,
|
||||
@@ -241,6 +246,11 @@ def generate_image_with_replicate(
|
||||
"output_quality": 100,
|
||||
}
|
||||
}
|
||||
|
||||
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
||||
if seed:
|
||||
json["input"]["seed"] = seed
|
||||
|
||||
create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json()
|
||||
|
||||
# Get status of image generation task
|
||||
|
||||
@@ -1300,7 +1300,7 @@ async def event_generator(
|
||||
generated_images.append(generated_image)
|
||||
|
||||
generated_asset_results["images"] = {
|
||||
"query": improved_image_prompt,
|
||||
"description": improved_image_prompt,
|
||||
}
|
||||
|
||||
async for result in send_event(
|
||||
|
||||
@@ -1066,7 +1066,7 @@ async def generate_mermaidjs_diagram_from_description(
|
||||
|
||||
async def generate_better_image_prompt(
|
||||
q: str,
|
||||
conversation_history: str,
|
||||
conversation_history: List[ChatMessageModel],
|
||||
location_data: LocationData,
|
||||
note_references: List[Dict[str, Any]],
|
||||
online_results: Optional[dict] = None,
|
||||
@@ -1081,7 +1081,6 @@ async def generate_better_image_prompt(
|
||||
Generate a better image prompt from the given query
|
||||
"""
|
||||
|
||||
today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A")
|
||||
personality_context = (
|
||||
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
|
||||
)
|
||||
@@ -1089,51 +1088,33 @@ async def generate_better_image_prompt(
|
||||
|
||||
location = f"{location_data}" if location_data else "Unknown"
|
||||
|
||||
user_references = "\n\n".join([f"# {item['compiled']}" for item in note_references])
|
||||
user_references = "\n\n".join([f"- text:\n{item['compiled']}" for item in note_references])
|
||||
|
||||
simplified_online_results = {}
|
||||
for result in online_results or []:
|
||||
if online_results[result].get("answerBox"):
|
||||
simplified_online_results[result] = online_results[result]["answerBox"]
|
||||
elif online_results[result].get("webpages"):
|
||||
simplified_online_results[result] = online_results[result]["webpages"]
|
||||
|
||||
if online_results:
|
||||
for result in online_results:
|
||||
if online_results[result].get("answerBox"):
|
||||
simplified_online_results[result] = online_results[result]["answerBox"]
|
||||
elif online_results[result].get("webpages"):
|
||||
simplified_online_results[result] = online_results[result]["webpages"]
|
||||
|
||||
if model_type == TextToImageModelConfig.ModelType.OPENAI:
|
||||
image_prompt = prompts.image_generation_improve_prompt_dalle.format(
|
||||
query=q,
|
||||
chat_history=conversation_history,
|
||||
location=location,
|
||||
current_date=today_date,
|
||||
references=user_references,
|
||||
online_results=simplified_online_results,
|
||||
personality_context=personality_context,
|
||||
)
|
||||
elif model_type in [
|
||||
TextToImageModelConfig.ModelType.STABILITYAI,
|
||||
TextToImageModelConfig.ModelType.REPLICATE,
|
||||
TextToImageModelConfig.ModelType.GOOGLE,
|
||||
]:
|
||||
image_prompt = prompts.image_generation_improve_prompt_sd.format(
|
||||
query=q,
|
||||
chat_history=conversation_history,
|
||||
location=location,
|
||||
current_date=today_date,
|
||||
references=user_references,
|
||||
online_results=simplified_online_results,
|
||||
personality_context=personality_context,
|
||||
)
|
||||
enhance_image_system_message = prompts.enhance_image_system_message.format(
|
||||
location=location,
|
||||
references=user_references,
|
||||
online_results=simplified_online_results or "",
|
||||
personality_context=personality_context,
|
||||
)
|
||||
|
||||
agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None
|
||||
|
||||
with timer("Chat actor: Generate contextual image prompt", logger):
|
||||
response = await send_message_to_model_wrapper(
|
||||
image_prompt,
|
||||
q,
|
||||
system_message=enhance_image_system_message,
|
||||
query_images=query_images,
|
||||
user=user,
|
||||
query_files=query_files,
|
||||
chat_history=conversation_history,
|
||||
agent_chat_model=agent_chat_model,
|
||||
user=user,
|
||||
tracer=tracer,
|
||||
)
|
||||
response_text = response.text.strip()
|
||||
|
||||
Reference in New Issue
Block a user