Add Vision Support (#889)

# Summary of Changes
* New UI to show preview of image uploads
* ChatML message changes to support gpt-4o vision based responses on images
* AWS S3 image uploads for persistent image context in conversations
* Database changes to have `vision_enabled` option in server admin panel while configuring models
* Render previously uploaded images in the chat history, show uploaded images for pending msgs
* Pass the uploaded_image_url through to subqueries
* Allow image to render upon first message from the homepage
* Add rendering support for images to shared chat as well
* Fix some UI/functionality bugs in the share page
* Convert user attached images for chat to webp format before upload
* Use placeholder to attached image for data source, response mode actors
* Update all clients to call /api/chat as a POST instead of GET request
* Fix copying chat messages with images to clipboard

TLDR; Add vision support for openai models on Khoj via the web UI!

---------

Co-authored-by: sabaimran <narmiabas@gmail.com>
Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
Raghav Tirumale
2024-09-09 17:22:18 -05:00
committed by GitHub
parent b553bba1d8
commit 549686a7a4
33 changed files with 740 additions and 417 deletions

View File

@@ -53,6 +53,11 @@ div.chatMessageContainer h3 img {
width: 24px;
}
div.you img {
height: 16rem;
width: auto;
}
div.you {
color: hsla(var(--secondary-foreground));
}

View File

@@ -111,6 +111,7 @@ export interface SingleChatMessage {
rawQuery?: string;
intent?: Intent;
agent?: AgentData;
uploadedImageData?: string;
}
export interface StreamMessage {
@@ -122,6 +123,7 @@ export interface StreamMessage {
rawQuery: string;
timestamp: string;
agent?: AgentData;
uploadedImageData?: string;
}
export interface ChatHistoryData {
@@ -203,6 +205,7 @@ interface ChatMessageProps {
borderLeftColor?: string;
isLastMessage?: boolean;
agent?: AgentData;
uploadedImageData?: string;
}
interface TrainOfThoughtProps {
@@ -273,6 +276,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
export default function ChatMessage(props: ChatMessageProps) {
const [copySuccess, setCopySuccess] = useState<boolean>(false);
const [isHovering, setIsHovering] = useState<boolean>(false);
const [textRendered, setTextRendered] = useState<string>("");
const [markdownRendered, setMarkdownRendered] = useState<string>("");
const [isPlaying, setIsPlaying] = useState<boolean>(false);
const [interrupted, setInterrupted] = useState<boolean>(false);
@@ -322,6 +326,10 @@ export default function ChatMessage(props: ChatMessageProps) {
.replace(/\\\[/g, "LEFTBRACKET")
.replace(/\\\]/g, "RIGHTBRACKET");
if (props.chatMessage.uploadedImageData) {
message = `![uploaded image](${props.chatMessage.uploadedImageData})\n\n${message}`;
}
if (props.chatMessage.intent && props.chatMessage.intent.type == "text-to-image") {
message = `![generated image](data:image/png;base64,${message})`;
} else if (props.chatMessage.intent && props.chatMessage.intent.type == "text-to-image2") {
@@ -340,6 +348,9 @@ export default function ChatMessage(props: ChatMessageProps) {
message += `\n\n**Inferred Query**\n\n${props.chatMessage.intent["inferred-queries"][0]}`;
}
setTextRendered(message);
// Render the markdown
let markdownRendered = md.render(message);
// Replace placeholders with LaTeX delimiters
@@ -542,7 +553,6 @@ export default function ChatMessage(props: ChatMessageProps) {
className={constructClasses(props.chatMessage)}
onMouseLeave={(event) => setIsHovering(false)}
onMouseEnter={(event) => setIsHovering(true)}
onClick={props.chatMessage.by === "khoj" ? (event) => undefined : undefined}
>
<div className={chatMessageWrapperClasses(props.chatMessage)}>
<div
@@ -595,7 +605,7 @@ export default function ChatMessage(props: ChatMessageProps) {
title="Copy"
className={`${styles.copyButton}`}
onClick={() => {
navigator.clipboard.writeText(props.chatMessage.message);
navigator.clipboard.writeText(textRendered);
setCopySuccess(true);
}}
>