Add Vision Support (#889)

# Summary of Changes
* New UI to show preview of image uploads
* ChatML message changes to support gpt-4o vision based responses on images
* AWS S3 image uploads for persistent image context in conversations
* Database changes to have `vision_enabled` option in server admin panel while configuring models
* Render previously uploaded images in the chat history, show uploaded images for pending msgs
* Pass the uploaded_image_url through to subqueries
* Allow image to render upon first message from the homepage
* Add rendering support for images to shared chat as well
* Fix some UI/functionality bugs in the share page
* Convert user attached images for chat to webp format before upload
* Use placeholder to attached image for data source, response mode actors
* Update all clients to call /api/chat as a POST instead of GET request
* Fix copying chat messages with images to clipboard

TLDR; Add vision support for openai models on Khoj via the web UI!

---------

Co-authored-by: sabaimran <narmiabas@gmail.com>
Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
Raghav Tirumale
2024-09-09 17:22:18 -05:00
committed by GitHub
parent b553bba1d8
commit 549686a7a4
33 changed files with 740 additions and 417 deletions

View File

@@ -16,6 +16,7 @@ import {
Microphone,
Notebook,
Paperclip,
X,
Question,
Robot,
Shapes,
@@ -55,6 +56,7 @@ export interface ChatOptions {
interface ChatInputProps {
sendMessage: (message: string) => void;
sendImage: (image: string) => void;
sendDisabled: boolean;
setUploadedFiles?: (files: string[]) => void;
conversationId?: string | null;
@@ -75,6 +77,9 @@ export default function ChatInputArea(props: ChatInputProps) {
const [showLoginPrompt, setShowLoginPrompt] = useState(false);
const [recording, setRecording] = useState(false);
const [imageUploaded, setImageUploaded] = useState(false);
const [imagePath, setImagePath] = useState<string | null>(null);
const [imageData, setImageData] = useState<string | null>(null);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
const [progressValue, setProgressValue] = useState(0);
@@ -97,7 +102,30 @@ export default function ChatInputArea(props: ChatInputProps) {
}
}, [uploading]);
useEffect(() => {
async function fetchImageData() {
if (imagePath) {
const response = await fetch(imagePath);
const blob = await response.blob();
const reader = new FileReader();
reader.onload = function () {
const base64data = reader.result;
setImageData(base64data as string);
};
reader.readAsDataURL(blob);
}
setUploading(false);
}
setUploading(true);
fetchImageData();
}, [imagePath]);
function onSendMessage() {
if (imageUploaded) {
setImageUploaded(false);
setImagePath(null);
props.sendImage(imageData || "");
}
if (!message.trim()) return;
if (!props.isLoggedIn) {
@@ -142,6 +170,17 @@ export default function ChatInputArea(props: ChatInputProps) {
setShowLoginPrompt(true);
return;
}
// check for image file
const image_endings = ["jpg", "jpeg", "png"];
for (let i = 0; i < files.length; i++) {
const file = files[i];
const file_extension = file.name.split(".").pop();
if (image_endings.includes(file_extension || "")) {
setImageUploaded(true);
setImagePath(URL.createObjectURL(file));
return;
}
}
uploadDataForIndexing(
files,
@@ -287,6 +326,11 @@ export default function ChatInputArea(props: ChatInputProps) {
setIsDragAndDropping(false);
}
function removeImageUpload() {
setImageUploaded(false);
setImagePath(null);
}
return (
<>
{showLoginPrompt && loginRedirectMessage && (
@@ -397,11 +441,24 @@ export default function ChatInputArea(props: ChatInputProps) {
</div>
)}
<div
className={`${styles.actualInputArea} items-center justify-between dark:bg-neutral-700`}
className={`${styles.actualInputArea} items-center justify-between dark:bg-neutral-700 relative`}
onDragOver={handleDragOver}
onDragLeave={handleDragLeave}
onDrop={handleDragAndDropFiles}
>
{imageUploaded && (
<div className="absolute bottom-[80px] left-0 right-0 dark:bg-neutral-700 bg-white pt-5 pb-5 w-full rounded-lg border dark:border-none grid grid-cols-2">
<div className="pl-4 pr-4">
<img src={imagePath || ""} alt="img" className="w-auto max-h-[100px]" />
</div>
<div className="pl-4 pr-4">
<X
className="w-6 h-6 float-right dark:hover:bg-[hsl(var(--background))] hover:bg-neutral-100 rounded-sm"
onClick={removeImageUpload}
/>
</div>
</div>
)}
<input
type="file"
multiple={true}
@@ -427,6 +484,8 @@ export default function ChatInputArea(props: ChatInputProps) {
value={message}
onKeyDown={(e) => {
if (e.key === "Enter" && !e.shiftKey) {
setImageUploaded(false);
setImagePath(null);
e.preventDefault();
onSendMessage();
}