mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 21:29:11 +00:00
Improve, simplify image generation prompts and context flow
Use a seed to stabilize image consistency across turns when:
- the KHOJ_LLM_SEED env var is set
- using image models via Replicate (OpenAI and Google do not support an image seed)
This commit is contained in:
@@ -23,7 +23,6 @@ Today is {day_of_week}, {current_date} in UTC.
|
|||||||
- display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\]
|
- display math mode: insert linebreak after opening $$, \\[ and before closing $$, \\]
|
||||||
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
|
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
|
||||||
For example: "The weather today is sunny [1](https://weather.com)."
|
For example: "The weather today is sunny [1](https://weather.com)."
|
||||||
- Mention generated assets like images by reference, e.g . Do not manually output raw, b64 encoded bytes in your response.
|
|
||||||
- Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code.
|
- Do not respond with raw programs or scripts in your final response unless you know the user is a programmer or has explicitly requested code.
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
@@ -46,7 +45,6 @@ Today is {day_of_week}, {current_date} in UTC.
|
|||||||
- display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]`
|
- display math mode: insert linebreak after opening `$$`, `\\[` and before closing `$$`, `\\]`
|
||||||
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
|
- Provide inline citations to documents and websites referenced. Add them inline in markdown format to directly support your claim.
|
||||||
For example: "The weather today is sunny [1](https://weather.com)."
|
For example: "The weather today is sunny [1](https://weather.com)."
|
||||||
- Mention generated assets like images by reference, e.g . Do not manually output raw, b64 encoded bytes in your response.
|
|
||||||
|
|
||||||
# Instructions:\n{bio}
|
# Instructions:\n{bio}
|
||||||
""".strip()
|
""".strip()
|
||||||
@@ -115,45 +113,38 @@ User's Notes:
|
|||||||
## Image Generation
|
## Image Generation
|
||||||
## --
|
## --
|
||||||
|
|
||||||
image_generation_improve_prompt_base = """
|
enhance_image_system_message = PromptTemplate.from_template(
|
||||||
|
"""
|
||||||
You are a talented media artist with the ability to describe images to compose in professional, fine detail.
|
You are a talented media artist with the ability to describe images to compose in professional, fine detail.
|
||||||
|
Your image description will be transformed into an image by an AI model on your team.
|
||||||
{personality_context}
|
{personality_context}
|
||||||
Generate a vivid description of the image to be rendered using the provided context and user prompt below:
|
|
||||||
|
|
||||||
Today's Date: {current_date}
|
# Instructions
|
||||||
User's Location: {location}
|
- Retain important information and follow instructions by the user when composing the image description.
|
||||||
|
- Weave in the context provided below if it will enhance the image.
|
||||||
|
- Specify desired elements, lighting, mood, and composition.
|
||||||
|
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
|
||||||
|
- Transform any negations in user instructions into positive alternatives.
|
||||||
|
Instead of saying what should NOT be in the image, describe what SHOULD be there instead.
|
||||||
|
Examples:
|
||||||
|
- "no sun" → "overcast cloudy sky"
|
||||||
|
- "don't include people" → "empty landscape" or "solitary scene"
|
||||||
|
- Ensure your image description is in prose format (e.g no lists, links).
|
||||||
|
- If any text is to be rendered in the image put it within double quotes in your image description.
|
||||||
|
|
||||||
User's Notes:
|
# Context
|
||||||
|
|
||||||
|
## User Location: {location}
|
||||||
|
|
||||||
|
## User Documents
|
||||||
{references}
|
{references}
|
||||||
|
|
||||||
Online References:
|
## Online References
|
||||||
{online_results}
|
{online_results}
|
||||||
|
|
||||||
Conversation Log:
|
Now generate a vivid description of the image to be rendered.
|
||||||
{chat_history}
|
|
||||||
|
|
||||||
User Prompt: "{query}"
|
Image Description:
|
||||||
|
|
||||||
Now generate an professional description of the image to generate in vivid, fine detail.
|
|
||||||
- Use today's date, user's location, user's notes and online references to weave in any context that will improve the image generation.
|
|
||||||
- Retain any important information and follow any instructions in the conversation log or user prompt.
|
|
||||||
- Add specific, fine position details. Mention painting style, camera parameters to compose the image.
|
|
||||||
- Ensure your improved prompt is in prose format."""
|
|
||||||
|
|
||||||
image_generation_improve_prompt_dalle = PromptTemplate.from_template(
|
|
||||||
f"""
|
|
||||||
{image_generation_improve_prompt_base}
|
|
||||||
|
|
||||||
Improved Prompt:
|
|
||||||
""".strip()
|
|
||||||
)
|
|
||||||
|
|
||||||
image_generation_improve_prompt_sd = PromptTemplate.from_template(
|
|
||||||
f"""
|
|
||||||
{image_generation_improve_prompt_base}
|
|
||||||
- If any text is to be rendered in the image put it within double quotes in your improved prompt.
|
|
||||||
|
|
||||||
Improved Prompt:
|
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -645,9 +645,10 @@ def generate_chatml_messages_with_context(
|
|||||||
reconstructed_context_message = ChatMessage(content=message_context, role="user")
|
reconstructed_context_message = ChatMessage(content=message_context, role="user")
|
||||||
chatml_messages.insert(0, reconstructed_context_message)
|
chatml_messages.insert(0, reconstructed_context_message)
|
||||||
|
|
||||||
|
# Add generated assets
|
||||||
if not is_none_or_empty(chat.images) and role == "assistant":
|
if not is_none_or_empty(chat.images) and role == "assistant":
|
||||||
generated_assets["image"] = {
|
generated_assets["image"] = {
|
||||||
"query": (chat.intent.inferred_queries or [user_message])[0],
|
"description": (chat.intent.inferred_queries or [user_message])[0],
|
||||||
}
|
}
|
||||||
|
|
||||||
if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant":
|
if not is_none_or_empty(chat.mermaidjsDiagram) and role == "assistant":
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import base64
|
import base64
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
from typing import Any, Callable, Dict, List, Optional
|
from typing import Any, Callable, Dict, List, Optional
|
||||||
|
|
||||||
@@ -21,6 +22,7 @@ from khoj.database.adapters import ConversationAdapters
|
|||||||
from khoj.database.models import (
|
from khoj.database.models import (
|
||||||
Agent,
|
Agent,
|
||||||
ChatMessageModel,
|
ChatMessageModel,
|
||||||
|
Intent,
|
||||||
KhojUser,
|
KhojUser,
|
||||||
TextToImageModelConfig,
|
TextToImageModelConfig,
|
||||||
)
|
)
|
||||||
@@ -60,14 +62,17 @@ async def text_to_image(
|
|||||||
return
|
return
|
||||||
|
|
||||||
text2image_model = text_to_image_config.model_name
|
text2image_model = text_to_image_config.model_name
|
||||||
chat_history_str = ""
|
image_chat_history: List[ChatMessageModel] = []
|
||||||
|
default_intent = Intent(type="remember")
|
||||||
for chat in chat_history[-4:]:
|
for chat in chat_history[-4:]:
|
||||||
if chat.by == "you":
|
if chat.by == "you":
|
||||||
chat_history_str += f"Q: {chat.message}\n"
|
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
|
||||||
|
elif chat.by == "khoj" and chat.images and chat.intent and chat.intent.inferred_queries:
|
||||||
|
image_chat_history += [
|
||||||
|
ChatMessageModel(by=chat.by, message=chat.intent.inferred_queries[0], intent=default_intent)
|
||||||
|
]
|
||||||
elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
|
elif chat.by == "khoj" and chat.intent and chat.intent.type in ["remember", "reminder"]:
|
||||||
chat_history_str += f"A: {chat.message}\n"
|
image_chat_history += [ChatMessageModel(by=chat.by, message=chat.message, intent=default_intent)]
|
||||||
elif chat.by == "khoj" and chat.images:
|
|
||||||
chat_history_str += f"A: Improved Prompt: {chat.intent.inferred_queries[0]}\n"
|
|
||||||
|
|
||||||
if send_status_func:
|
if send_status_func:
|
||||||
async for event in send_status_func("**Enhancing the Painting Prompt**"):
|
async for event in send_status_func("**Enhancing the Painting Prompt**"):
|
||||||
@@ -77,7 +82,7 @@ async def text_to_image(
|
|||||||
# Use the user's message, chat history, and other context
|
# Use the user's message, chat history, and other context
|
||||||
image_prompt = await generate_better_image_prompt(
|
image_prompt = await generate_better_image_prompt(
|
||||||
message,
|
message,
|
||||||
chat_history_str,
|
image_chat_history,
|
||||||
location_data=location_data,
|
location_data=location_data,
|
||||||
note_references=references,
|
note_references=references,
|
||||||
online_results=online_results,
|
online_results=online_results,
|
||||||
@@ -241,6 +246,11 @@ def generate_image_with_replicate(
|
|||||||
"output_quality": 100,
|
"output_quality": 100,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
seed = int(os.getenv("KHOJ_LLM_SEED")) if os.getenv("KHOJ_LLM_SEED") else None
|
||||||
|
if seed:
|
||||||
|
json["input"]["seed"] = seed
|
||||||
|
|
||||||
create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json()
|
create_prediction = requests.post(replicate_create_prediction_url, headers=headers, json=json).json()
|
||||||
|
|
||||||
# Get status of image generation task
|
# Get status of image generation task
|
||||||
|
|||||||
@@ -1300,7 +1300,7 @@ async def event_generator(
|
|||||||
generated_images.append(generated_image)
|
generated_images.append(generated_image)
|
||||||
|
|
||||||
generated_asset_results["images"] = {
|
generated_asset_results["images"] = {
|
||||||
"query": improved_image_prompt,
|
"description": improved_image_prompt,
|
||||||
}
|
}
|
||||||
|
|
||||||
async for result in send_event(
|
async for result in send_event(
|
||||||
|
|||||||
@@ -1066,7 +1066,7 @@ async def generate_mermaidjs_diagram_from_description(
|
|||||||
|
|
||||||
async def generate_better_image_prompt(
|
async def generate_better_image_prompt(
|
||||||
q: str,
|
q: str,
|
||||||
conversation_history: str,
|
conversation_history: List[ChatMessageModel],
|
||||||
location_data: LocationData,
|
location_data: LocationData,
|
||||||
note_references: List[Dict[str, Any]],
|
note_references: List[Dict[str, Any]],
|
||||||
online_results: Optional[dict] = None,
|
online_results: Optional[dict] = None,
|
||||||
@@ -1081,7 +1081,6 @@ async def generate_better_image_prompt(
|
|||||||
Generate a better image prompt from the given query
|
Generate a better image prompt from the given query
|
||||||
"""
|
"""
|
||||||
|
|
||||||
today_date = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d, %A")
|
|
||||||
personality_context = (
|
personality_context = (
|
||||||
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
|
prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else ""
|
||||||
)
|
)
|
||||||
@@ -1089,51 +1088,33 @@ async def generate_better_image_prompt(
|
|||||||
|
|
||||||
location = f"{location_data}" if location_data else "Unknown"
|
location = f"{location_data}" if location_data else "Unknown"
|
||||||
|
|
||||||
user_references = "\n\n".join([f"# {item['compiled']}" for item in note_references])
|
user_references = "\n\n".join([f"- text:\n{item['compiled']}" for item in note_references])
|
||||||
|
|
||||||
simplified_online_results = {}
|
simplified_online_results = {}
|
||||||
|
for result in online_results or []:
|
||||||
|
if online_results[result].get("answerBox"):
|
||||||
|
simplified_online_results[result] = online_results[result]["answerBox"]
|
||||||
|
elif online_results[result].get("webpages"):
|
||||||
|
simplified_online_results[result] = online_results[result]["webpages"]
|
||||||
|
|
||||||
if online_results:
|
enhance_image_system_message = prompts.enhance_image_system_message.format(
|
||||||
for result in online_results:
|
location=location,
|
||||||
if online_results[result].get("answerBox"):
|
references=user_references,
|
||||||
simplified_online_results[result] = online_results[result]["answerBox"]
|
online_results=simplified_online_results or "",
|
||||||
elif online_results[result].get("webpages"):
|
personality_context=personality_context,
|
||||||
simplified_online_results[result] = online_results[result]["webpages"]
|
)
|
||||||
|
|
||||||
if model_type == TextToImageModelConfig.ModelType.OPENAI:
|
|
||||||
image_prompt = prompts.image_generation_improve_prompt_dalle.format(
|
|
||||||
query=q,
|
|
||||||
chat_history=conversation_history,
|
|
||||||
location=location,
|
|
||||||
current_date=today_date,
|
|
||||||
references=user_references,
|
|
||||||
online_results=simplified_online_results,
|
|
||||||
personality_context=personality_context,
|
|
||||||
)
|
|
||||||
elif model_type in [
|
|
||||||
TextToImageModelConfig.ModelType.STABILITYAI,
|
|
||||||
TextToImageModelConfig.ModelType.REPLICATE,
|
|
||||||
TextToImageModelConfig.ModelType.GOOGLE,
|
|
||||||
]:
|
|
||||||
image_prompt = prompts.image_generation_improve_prompt_sd.format(
|
|
||||||
query=q,
|
|
||||||
chat_history=conversation_history,
|
|
||||||
location=location,
|
|
||||||
current_date=today_date,
|
|
||||||
references=user_references,
|
|
||||||
online_results=simplified_online_results,
|
|
||||||
personality_context=personality_context,
|
|
||||||
)
|
|
||||||
|
|
||||||
agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None
|
agent_chat_model = AgentAdapters.get_agent_chat_model(agent, user) if agent else None
|
||||||
|
|
||||||
with timer("Chat actor: Generate contextual image prompt", logger):
|
with timer("Chat actor: Generate contextual image prompt", logger):
|
||||||
response = await send_message_to_model_wrapper(
|
response = await send_message_to_model_wrapper(
|
||||||
image_prompt,
|
q,
|
||||||
|
system_message=enhance_image_system_message,
|
||||||
query_images=query_images,
|
query_images=query_images,
|
||||||
user=user,
|
|
||||||
query_files=query_files,
|
query_files=query_files,
|
||||||
|
chat_history=conversation_history,
|
||||||
agent_chat_model=agent_chat_model,
|
agent_chat_model=agent_chat_model,
|
||||||
|
user=user,
|
||||||
tracer=tracer,
|
tracer=tracer,
|
||||||
)
|
)
|
||||||
response_text = response.text.strip()
|
response_text = response.text.strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user