Add Vision Support (#889)

# Summary of Changes

* New UI to show preview of image uploads
* ChatML message changes to support gpt-4o vision based responses on images
* AWS S3 image uploads for persistent image context in conversations
* Database changes to have `vision_enabled` option in server admin panel while configuring models
* Render previously uploaded images in the chat history, show uploaded images for pending messages
* Pass the uploaded_image_url through to subqueries
* Allow image to render upon first message from the homepage
* Add rendering support for images to shared chat as well
* Fix some UI/functionality bugs in the share page
* Convert user attached images for chat to webp format before upload
* Use a placeholder for the attached image in the data source and response mode actors
* Update all clients to call /api/chat as a POST instead of GET request
* Fix copying chat messages with images to clipboard

TL;DR: Add vision support for OpenAI models on Khoj via the web UI!

---------

Co-authored-by: sabaimran <narmiabas@gmail.com>
Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
2026-03-04 05:39:06 +00:00 · 2024-09-09 17:22:18 -05:00
parent b553bba1d8
commit 549686a7a4
33 changed files with 740 additions and 417 deletions
--- a/src/interface/web/app/components/chatHistory/chatHistory.tsx
+++ b/src/interface/web/app/components/chatHistory/chatHistory.tsx
@@ -267,6 +267,7 @@ export default function ChatHistory(props: ChatHistoryProps) {
                                            created: message.timestamp,
                                            by: "you",
                                            automationId: "",
+                                            uploadedImageData: message.uploadedImageData,
                                        }}
                                        customClassName="fullHistory"
                                        borderLeftColor={`${data?.agent.color}-500`}
@@ -309,6 +310,7 @@ export default function ChatHistory(props: ChatHistoryProps) {
                                created: new Date().getTime().toString(),
                                by: "you",
                                automationId: "",
+                                uploadedImageData: props.pendingMessage,
                            }}
                            customClassName="fullHistory"
                            borderLeftColor={`${data?.agent.color}-500`}
--- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
+++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
@@ -16,6 +16,7 @@ import {
    Microphone,
    Notebook,
    Paperclip,
+    X,
    Question,
    Robot,
    Shapes,
@@ -55,6 +56,7 @@ export interface ChatOptions {

 interface ChatInputProps {
    sendMessage: (message: string) => void;
+    sendImage: (image: string) => void;
    sendDisabled: boolean;
    setUploadedFiles?: (files: string[]) => void;
    conversationId?: string | null;
@@ -75,6 +77,9 @@ export default function ChatInputArea(props: ChatInputProps) {
    const [showLoginPrompt, setShowLoginPrompt] = useState(false);

    const [recording, setRecording] = useState(false);
+    const [imageUploaded, setImageUploaded] = useState(false);
+    const [imagePath, setImagePath] = useState<string | null>(null);
+    const [imageData, setImageData] = useState<string | null>(null);
    const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);

    const [progressValue, setProgressValue] = useState(0);
@@ -97,7 +102,30 @@ export default function ChatInputArea(props: ChatInputProps) {
        }
    }, [uploading]);

+    // Read the attached image behind the `imagePath` object URL into a base64
+    // data URL (`imageData`) so it can be handed to `props.sendImage` on send.
+    // `uploading` stays true until the read has finished, failed or been cancelled.
+    useEffect(() => {
+        // Forget the previous attachment so a stale image can never be sent.
+        setImageData(null);
+        if (!imagePath) return undefined;
+
+        let cancelled = false;
+        let inProgress = true;
+        // Clear the uploading flag exactly once, whichever path ends the read.
+        const finishUpload = () => {
+            if (!inProgress) return;
+            inProgress = false;
+            setUploading(false);
+        };
+
+        const reader = new FileReader();
+        reader.onload = () => {
+            if (!cancelled && typeof reader.result === "string") {
+                setImageData(reader.result);
+            }
+        };
+        // `loadend` fires after load, error and abort alike.
+        reader.onloadend = finishUpload;
+
+        setUploading(true);
+        fetch(imagePath)
+            .then((response) => response.blob())
+            .then((blob) => {
+                if (!cancelled) reader.readAsDataURL(blob);
+            })
+            .catch((error: unknown) => {
+                console.error("Failed to read attached image", error);
+                finishUpload();
+            });
+
+        // Runs when `imagePath` changes (image replaced, removed or sent) and on unmount.
+        return () => {
+            cancelled = true;
+            if (reader.readyState === FileReader.LOADING) reader.abort();
+            finishUpload();
+            // The preview no longer points at this URL, so release the backing blob.
+            URL.revokeObjectURL(imagePath);
+        };
+    }, [imagePath]);
+
    function onSendMessage() {
+        if (imageUploaded) {
+            setImageUploaded(false);
+            setImagePath(null);
+            props.sendImage(imageData || "");
+        }
        if (!message.trim()) return;

        if (!props.isLoggedIn) {
@@ -142,6 +170,17 @@ export default function ChatInputArea(props: ChatInputProps) {
            setShowLoginPrompt(true);
            return;
        }
+        // Attached images are previewed for the chat message instead of being
+        // uploaded for indexing. Extensions are compared case-insensitively so
+        // names such as "IMG_0001.JPG" are recognised as images too.
+        const imageExtensions = ["jpg", "jpeg", "png"];
+        for (let i = 0; i < files.length; i++) {
+            const file = files[i];
+            const extension = (file.name.split(".").pop() || "").toLowerCase();
+            if (imageExtensions.includes(extension)) {
+                setImageUploaded(true);
+                setImagePath(URL.createObjectURL(file));
+                // Only the first image is attached; skip indexing entirely.
+                return;
+            }
+        }

        uploadDataForIndexing(
            files,
@@ -287,6 +326,11 @@ export default function ChatInputArea(props: ChatInputProps) {
        setIsDragAndDropping(false);
    }

+    // Discard the pending image attachment: hide the preview and drop both the
+    // object URL and the base64 payload so it cannot be sent with a later message.
+    function removeImageUpload() {
+        setImageUploaded(false);
+        setImagePath(null);
+        setImageData(null);
+    }
+
    return (
        <>
            {showLoginPrompt && loginRedirectMessage && (
@@ -397,11 +441,24 @@ export default function ChatInputArea(props: ChatInputProps) {
                </div>
            )}
            <div
-                className={`${styles.actualInputArea} items-center justify-between dark:bg-neutral-700`}
+                className={`${styles.actualInputArea} items-center justify-between dark:bg-neutral-700 relative`}
                onDragOver={handleDragOver}
                onDragLeave={handleDragLeave}
                onDrop={handleDragAndDropFiles}
            >
+                {imageUploaded && (
+                    <div className="absolute bottom-[80px] left-0 right-0 dark:bg-neutral-700 bg-white pt-5 pb-5 w-full rounded-lg border dark:border-none grid grid-cols-2">
+                        <div className="pl-4 pr-4">
+                            <img src={imagePath || ""} alt="img" className="w-auto max-h-[100px]" />
+                        </div>
+                        <div className="pl-4 pr-4">
+                            <X
+                                className="w-6 h-6 float-right dark:hover:bg-[hsl(var(--background))] hover:bg-neutral-100 rounded-sm"
+                                onClick={removeImageUpload}
+                            />
+                        </div>
+                    </div>
+                )}
                <input
                    type="file"
                    multiple={true}
@@ -427,6 +484,8 @@ export default function ChatInputArea(props: ChatInputProps) {
                        value={message}
                        onKeyDown={(e) => {
                            if (e.key === "Enter" && !e.shiftKey) {
+                                setImageUploaded(false);
+                                setImagePath(null);
                                e.preventDefault();
                                onSendMessage();
                            }
--- a/src/interface/web/app/components/chatMessage/chatMessage.module.css
+++ b/src/interface/web/app/components/chatMessage/chatMessage.module.css
@@ -53,6 +53,11 @@ div.chatMessageContainer h3 img {
    width: 24px;
 }

+/* Images inside the user's own messages: fixed height, width follows the
+   aspect ratio but is capped at the container width so wide images do not
+   overflow; object-fit keeps the aspect ratio when the cap applies. */
+div.you img {
+    height: 16rem;
+    width: auto;
+    max-width: 100%;
+    object-fit: contain;
+}
+
 div.you {
    color: hsla(var(--secondary-foreground));
 }
--- a/src/interface/web/app/components/chatMessage/chatMessage.tsx
+++ b/src/interface/web/app/components/chatMessage/chatMessage.tsx
@@ -111,6 +111,7 @@ export interface SingleChatMessage {
    rawQuery?: string;
    intent?: Intent;
    agent?: AgentData;
+    uploadedImageData?: string;
 }

 export interface StreamMessage {
@@ -122,6 +123,7 @@ export interface StreamMessage {
    rawQuery: string;
    timestamp: string;
    agent?: AgentData;
+    uploadedImageData?: string;
 }

 export interface ChatHistoryData {
@@ -203,6 +205,7 @@ interface ChatMessageProps {
    borderLeftColor?: string;
    isLastMessage?: boolean;
    agent?: AgentData;
+    uploadedImageData?: string;
 }

 interface TrainOfThoughtProps {
@@ -273,6 +276,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
 export default function ChatMessage(props: ChatMessageProps) {
    const [copySuccess, setCopySuccess] = useState<boolean>(false);
    const [isHovering, setIsHovering] = useState<boolean>(false);
+    const [textRendered, setTextRendered] = useState<string>("");
    const [markdownRendered, setMarkdownRendered] = useState<string>("");
    const [isPlaying, setIsPlaying] = useState<boolean>(false);
    const [interrupted, setInterrupted] = useState<boolean>(false);
@@ -322,6 +326,10 @@ export default function ChatMessage(props: ChatMessageProps) {
            .replace(/\\\[/g, "LEFTBRACKET")
            .replace(/\\\]/g, "RIGHTBRACKET");

+        if (props.chatMessage.uploadedImageData) {
+            message = `![uploaded image](${props.chatMessage.uploadedImageData})\n\n${message}`;
+        }
+
        if (props.chatMessage.intent && props.chatMessage.intent.type == "text-to-image") {
            message = `![generated image](data:image/png;base64,${message})`;
        } else if (props.chatMessage.intent && props.chatMessage.intent.type == "text-to-image2") {
@@ -340,6 +348,9 @@ export default function ChatMessage(props: ChatMessageProps) {
            message += `\n\n**Inferred Query**\n\n${props.chatMessage.intent["inferred-queries"][0]}`;
        }

+        setTextRendered(message);
+
+        // Render the markdown
        let markdownRendered = md.render(message);

        // Replace placeholders with LaTeX delimiters
@@ -542,7 +553,6 @@ export default function ChatMessage(props: ChatMessageProps) {
            className={constructClasses(props.chatMessage)}
            onMouseLeave={(event) => setIsHovering(false)}
            onMouseEnter={(event) => setIsHovering(true)}
-            onClick={props.chatMessage.by === "khoj" ? (event) => undefined : undefined}
        >
            <div className={chatMessageWrapperClasses(props.chatMessage)}>
                <div
@@ -595,7 +605,7 @@ export default function ChatMessage(props: ChatMessageProps) {
                                title="Copy"
                                className={`${styles.copyButton}`}
                                onClick={() => {
-                                    navigator.clipboard.writeText(props.chatMessage.message);
+                                    navigator.clipboard.writeText(textRendered);
                                    setCopySuccess(true);
                                }}
                            >