Add Vision Support (#889)

# Summary of Changes
* New UI to show preview of image uploads
* ChatML message changes to support gpt-4o vision based responses on images
* AWS S3 image uploads for persistent image context in conversations
* Database changes to have `vision_enabled` option in server admin panel while configuring models
* Render previously uploaded images in the chat history, show uploaded images for pending msgs
* Pass the uploaded_image_url through to subqueries
* Allow image to render upon first message from the homepage
* Add rendering support for images to shared chat as well
* Fix some UI/functionality bugs in the share page
* Convert user attached images for chat to webp format before upload
* Use placeholder to attached image for data source, response mode actors
* Update all clients to call /api/chat as a POST instead of GET request
* Fix copying chat messages with images to clipboard

TLDR; Add vision support for openai models on Khoj via the web UI!

---------

Co-authored-by: sabaimran <narmiabas@gmail.com>
Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
Raghav Tirumale
2024-09-09 17:22:18 -05:00
committed by GitHub
parent b553bba1d8
commit 549686a7a4
33 changed files with 740 additions and 417 deletions

View File

@@ -40,9 +40,9 @@ export default function RootLayout({
content="default-src 'self' https://assets.khoj.dev;
media-src * blob:;
script-src 'self' https://assets.khoj.dev 'unsafe-inline' 'unsafe-eval';
connect-src 'self' https://ipapi.co/json ws://localhost:42110;
connect-src 'self' blob: https://ipapi.co/json ws://localhost:42110;
style-src 'self' https://assets.khoj.dev 'unsafe-inline' https://fonts.googleapis.com;
img-src 'self' data: https://*.khoj.dev https://*.googleusercontent.com https://*.google.com/ https://*.gstatic.com;
img-src 'self' data: blob: https://*.khoj.dev https://*.googleusercontent.com https://*.google.com/ https://*.gstatic.com;
font-src 'self' https://assets.khoj.dev https://fonts.gstatic.com;
child-src 'none';
object-src 'none';"

View File

@@ -27,12 +27,14 @@ interface ChatBodyDataProps {
setUploadedFiles: (files: string[]) => void;
isMobileWidth?: boolean;
isLoggedIn: boolean;
setImage64: (image64: string) => void;
}
function ChatBodyData(props: ChatBodyDataProps) {
const searchParams = useSearchParams();
const conversationId = searchParams.get("conversationId");
const [message, setMessage] = useState("");
const [image, setImage] = useState<string | null>(null);
const [processingMessage, setProcessingMessage] = useState(false);
const [agentMetadata, setAgentMetadata] = useState<AgentData | null>(null);
@@ -40,6 +42,19 @@ function ChatBodyData(props: ChatBodyDataProps) {
const onConversationIdChange = props.onConversationIdChange;
useEffect(() => {
if (image) {
props.setImage64(encodeURIComponent(image));
}
}, [image, props.setImage64]);
useEffect(() => {
const storedImage = localStorage.getItem("image");
if (storedImage) {
setImage(storedImage);
props.setImage64(encodeURIComponent(storedImage));
localStorage.removeItem("image");
}
const storedMessage = localStorage.getItem("message");
if (storedMessage) {
setProcessingMessage(true);
@@ -95,6 +110,7 @@ function ChatBodyData(props: ChatBodyDataProps) {
agentColor={agentMetadata?.color}
isLoggedIn={props.isLoggedIn}
sendMessage={(message) => setMessage(message)}
sendImage={(image) => setImage(image)}
sendDisabled={processingMessage}
chatOptionsData={props.chatOptionsData}
conversationId={conversationId}
@@ -116,6 +132,7 @@ export default function Chat() {
const [queryToProcess, setQueryToProcess] = useState<string>("");
const [processQuerySignal, setProcessQuerySignal] = useState(false);
const [uploadedFiles, setUploadedFiles] = useState<string[]>([]);
const [image64, setImage64] = useState<string>("");
const locationData = useIPLocationData();
const authenticatedData = useAuthenticatedData();
const isMobileWidth = useIsMobileWidth();
@@ -148,6 +165,7 @@ export default function Chat() {
completed: false,
timestamp: new Date().toISOString(),
rawQuery: queryToProcess || "",
uploadedImageData: decodeURIComponent(image64),
};
setMessages((prevMessages) => [...prevMessages, newStreamMessage]);
setProcessQuerySignal(true);
@@ -178,6 +196,7 @@ export default function Chat() {
if (done) {
setQueryToProcess("");
setProcessQuerySignal(false);
setImage64("");
break;
}
@@ -218,7 +237,14 @@ export default function Chat() {
chatAPI += `&region=${locationData.region}&country=${locationData.country}&city=${locationData.city}&timezone=${locationData.timezone}`;
}
const response = await fetch(chatAPI);
const response = await fetch(chatAPI, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: image64 ? JSON.stringify({ image: image64 }) : undefined,
});
try {
await readChatStream(response);
} catch (err) {
@@ -282,6 +308,7 @@ export default function Chat() {
setUploadedFiles={setUploadedFiles}
isMobileWidth={isMobileWidth}
onConversationIdChange={handleConversationIdChange}
setImage64={setImage64}
/>
</Suspense>
</div>

View File

@@ -267,6 +267,7 @@ export default function ChatHistory(props: ChatHistoryProps) {
created: message.timestamp,
by: "you",
automationId: "",
uploadedImageData: message.uploadedImageData,
}}
customClassName="fullHistory"
borderLeftColor={`${data?.agent.color}-500`}
@@ -309,6 +310,7 @@ export default function ChatHistory(props: ChatHistoryProps) {
created: new Date().getTime().toString(),
by: "you",
automationId: "",
uploadedImageData: props.pendingMessage,
}}
customClassName="fullHistory"
borderLeftColor={`${data?.agent.color}-500`}

View File

@@ -16,6 +16,7 @@ import {
Microphone,
Notebook,
Paperclip,
X,
Question,
Robot,
Shapes,
@@ -55,6 +56,7 @@ export interface ChatOptions {
interface ChatInputProps {
sendMessage: (message: string) => void;
sendImage: (image: string) => void;
sendDisabled: boolean;
setUploadedFiles?: (files: string[]) => void;
conversationId?: string | null;
@@ -75,6 +77,9 @@ export default function ChatInputArea(props: ChatInputProps) {
const [showLoginPrompt, setShowLoginPrompt] = useState(false);
const [recording, setRecording] = useState(false);
const [imageUploaded, setImageUploaded] = useState(false);
const [imagePath, setImagePath] = useState<string | null>(null);
const [imageData, setImageData] = useState<string | null>(null);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
const [progressValue, setProgressValue] = useState(0);
@@ -97,7 +102,30 @@ export default function ChatInputArea(props: ChatInputProps) {
}
}, [uploading]);
useEffect(() => {
async function fetchImageData() {
if (imagePath) {
const response = await fetch(imagePath);
const blob = await response.blob();
const reader = new FileReader();
reader.onload = function () {
const base64data = reader.result;
setImageData(base64data as string);
};
reader.readAsDataURL(blob);
}
setUploading(false);
}
setUploading(true);
fetchImageData();
}, [imagePath]);
function onSendMessage() {
if (imageUploaded) {
setImageUploaded(false);
setImagePath(null);
props.sendImage(imageData || "");
}
if (!message.trim()) return;
if (!props.isLoggedIn) {
@@ -142,6 +170,17 @@ export default function ChatInputArea(props: ChatInputProps) {
setShowLoginPrompt(true);
return;
}
// check for image file
const image_endings = ["jpg", "jpeg", "png"];
for (let i = 0; i < files.length; i++) {
const file = files[i];
const file_extension = file.name.split(".").pop();
if (image_endings.includes(file_extension || "")) {
setImageUploaded(true);
setImagePath(URL.createObjectURL(file));
return;
}
}
uploadDataForIndexing(
files,
@@ -287,6 +326,11 @@ export default function ChatInputArea(props: ChatInputProps) {
setIsDragAndDropping(false);
}
function removeImageUpload() {
setImageUploaded(false);
setImagePath(null);
}
return (
<>
{showLoginPrompt && loginRedirectMessage && (
@@ -397,11 +441,24 @@ export default function ChatInputArea(props: ChatInputProps) {
</div>
)}
<div
className={`${styles.actualInputArea} items-center justify-between dark:bg-neutral-700`}
className={`${styles.actualInputArea} items-center justify-between dark:bg-neutral-700 relative`}
onDragOver={handleDragOver}
onDragLeave={handleDragLeave}
onDrop={handleDragAndDropFiles}
>
{imageUploaded && (
<div className="absolute bottom-[80px] left-0 right-0 dark:bg-neutral-700 bg-white pt-5 pb-5 w-full rounded-lg border dark:border-none grid grid-cols-2">
<div className="pl-4 pr-4">
<img src={imagePath || ""} alt="img" className="w-auto max-h-[100px]" />
</div>
<div className="pl-4 pr-4">
<X
className="w-6 h-6 float-right dark:hover:bg-[hsl(var(--background))] hover:bg-neutral-100 rounded-sm"
onClick={removeImageUpload}
/>
</div>
</div>
)}
<input
type="file"
multiple={true}
@@ -427,6 +484,8 @@ export default function ChatInputArea(props: ChatInputProps) {
value={message}
onKeyDown={(e) => {
if (e.key === "Enter" && !e.shiftKey) {
setImageUploaded(false);
setImagePath(null);
e.preventDefault();
onSendMessage();
}

View File

@@ -53,6 +53,11 @@ div.chatMessageContainer h3 img {
width: 24px;
}
div.you img {
height: 16rem;
width: auto;
}
div.you {
color: hsla(var(--secondary-foreground));
}

View File

@@ -111,6 +111,7 @@ export interface SingleChatMessage {
rawQuery?: string;
intent?: Intent;
agent?: AgentData;
uploadedImageData?: string;
}
export interface StreamMessage {
@@ -122,6 +123,7 @@ export interface StreamMessage {
rawQuery: string;
timestamp: string;
agent?: AgentData;
uploadedImageData?: string;
}
export interface ChatHistoryData {
@@ -203,6 +205,7 @@ interface ChatMessageProps {
borderLeftColor?: string;
isLastMessage?: boolean;
agent?: AgentData;
uploadedImageData?: string;
}
interface TrainOfThoughtProps {
@@ -273,6 +276,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
export default function ChatMessage(props: ChatMessageProps) {
const [copySuccess, setCopySuccess] = useState<boolean>(false);
const [isHovering, setIsHovering] = useState<boolean>(false);
const [textRendered, setTextRendered] = useState<string>("");
const [markdownRendered, setMarkdownRendered] = useState<string>("");
const [isPlaying, setIsPlaying] = useState<boolean>(false);
const [interrupted, setInterrupted] = useState<boolean>(false);
@@ -322,6 +326,10 @@ export default function ChatMessage(props: ChatMessageProps) {
.replace(/\\\[/g, "LEFTBRACKET")
.replace(/\\\]/g, "RIGHTBRACKET");
if (props.chatMessage.uploadedImageData) {
message = `![uploaded image](${props.chatMessage.uploadedImageData})\n\n${message}`;
}
if (props.chatMessage.intent && props.chatMessage.intent.type == "text-to-image") {
message = `![generated image](data:image/png;base64,${message})`;
} else if (props.chatMessage.intent && props.chatMessage.intent.type == "text-to-image2") {
@@ -340,6 +348,9 @@ export default function ChatMessage(props: ChatMessageProps) {
message += `\n\n**Inferred Query**\n\n${props.chatMessage.intent["inferred-queries"][0]}`;
}
setTextRendered(message);
// Render the markdown
let markdownRendered = md.render(message);
// Replace placeholders with LaTeX delimiters
@@ -542,7 +553,6 @@ export default function ChatMessage(props: ChatMessageProps) {
className={constructClasses(props.chatMessage)}
onMouseLeave={(event) => setIsHovering(false)}
onMouseEnter={(event) => setIsHovering(true)}
onClick={props.chatMessage.by === "khoj" ? (event) => undefined : undefined}
>
<div className={chatMessageWrapperClasses(props.chatMessage)}>
<div
@@ -595,7 +605,7 @@ export default function ChatMessage(props: ChatMessageProps) {
title="Copy"
className={`${styles.copyButton}`}
onClick={() => {
navigator.clipboard.writeText(props.chatMessage.message);
navigator.clipboard.writeText(textRendered);
setCopySuccess(true);
}}
>

View File

@@ -79,7 +79,7 @@ async function verifyStatement(
let verificationMessage = `${verificationPrecursor} ${message}`;
const apiURL = `${chatURL}?q=${encodeURIComponent(verificationMessage)}&client=web&stream=true&conversation_id=${conversationId}`;
try {
const response = await fetch(apiURL);
const response = await fetch(apiURL, { method: "POST" });
if (!response.body) throw new Error("No response body found");
const reader = response.body?.getReader();

View File

@@ -45,9 +45,9 @@ export default function RootLayout({
content="default-src 'self' https://assets.khoj.dev;
media-src * blob:;
script-src 'self' https://assets.khoj.dev 'unsafe-inline' 'unsafe-eval';
connect-src 'self' https://ipapi.co/json ws://localhost:42110;
connect-src 'self' blob: https://ipapi.co/json ws://localhost:42110;
style-src 'self' https://assets.khoj.dev 'unsafe-inline' https://fonts.googleapis.com;
img-src 'self' data: https://*.khoj.dev https://*.googleusercontent.com https://*.google.com/ https://*.gstatic.com;
img-src 'self' data: blob: https://*.khoj.dev https://*.googleusercontent.com https://*.google.com/ https://*.gstatic.com;
font-src 'self' https://assets.khoj.dev https://fonts.gstatic.com;
child-src 'none';
object-src 'none';"

View File

@@ -43,6 +43,7 @@ function FisherYatesShuffle(array: any[]) {
function ChatBodyData(props: ChatBodyDataProps) {
const [message, setMessage] = useState("");
const [image, setImage] = useState<string | null>(null);
const [processingMessage, setProcessingMessage] = useState(false);
const [greeting, setGreeting] = useState("");
const [shuffledOptions, setShuffledOptions] = useState<Suggestion[]>([]);
@@ -141,6 +142,9 @@ function ChatBodyData(props: ChatBodyDataProps) {
onConversationIdChange?.(newConversationId);
window.location.href = `/chat?conversationId=${newConversationId}`;
localStorage.setItem("message", message);
if (image) {
localStorage.setItem("image", image);
}
} catch (error) {
console.error("Error creating new conversation:", error);
setProcessingMessage(false);
@@ -230,6 +234,7 @@ function ChatBodyData(props: ChatBodyDataProps) {
<ChatInputArea
isLoggedIn={props.isLoggedIn}
sendMessage={(message) => setMessage(message)}
sendImage={(image) => setImage(image)}
sendDisabled={processingMessage}
chatOptionsData={props.chatOptionsData}
conversationId={null}
@@ -310,6 +315,7 @@ function ChatBodyData(props: ChatBodyDataProps) {
<ChatInputArea
isLoggedIn={props.isLoggedIn}
sendMessage={(message) => setMessage(message)}
sendImage={(image) => setImage(image)}
sendDisabled={processingMessage}
chatOptionsData={props.chatOptionsData}
conversationId={null}

View File

@@ -20,9 +20,9 @@ export default function RootLayout({
httpEquiv="Content-Security-Policy"
content="default-src 'self' https://assets.khoj.dev;
script-src 'self' https://assets.khoj.dev 'unsafe-inline' 'unsafe-eval';
connect-src 'self' https://ipapi.co/json ws://localhost:42110;
connect-src 'self' blob: https://ipapi.co/json ws://localhost:42110;
style-src 'self' https://assets.khoj.dev 'unsafe-inline' https://fonts.googleapis.com;
img-src 'self' data: https://*.khoj.dev https://*.googleusercontent.com https://*.google.com/ https://*.gstatic.com;
img-src 'self' data: blob: https://*.khoj.dev https://*.googleusercontent.com https://*.google.com/ https://*.gstatic.com;
font-src 'self' https://assets.khoj.dev https://fonts.gstatic.com;
child-src 'none';
object-src 'none';"

View File

@@ -28,16 +28,23 @@ interface ChatBodyDataProps {
isLoggedIn: boolean;
conversationId?: string;
setQueryToProcess: (query: string) => void;
setImage64: (image64: string) => void;
}
function ChatBodyData(props: ChatBodyDataProps) {
const [message, setMessage] = useState("");
const [image, setImage] = useState<string | null>(null);
const [processingMessage, setProcessingMessage] = useState(false);
const [agentMetadata, setAgentMetadata] = useState<AgentData | null>(null);
const setQueryToProcess = props.setQueryToProcess;
const streamedMessages = props.streamedMessages;
useEffect(() => {
if (image) {
props.setImage64(encodeURIComponent(image));
}
}, [image, props.setImage64]);
useEffect(() => {
if (message) {
setProcessingMessage(true);
@@ -74,11 +81,12 @@ function ChatBodyData(props: ChatBodyDataProps) {
/>
</div>
<div
className={`${styles.inputBox} shadow-md bg-background align-middle items-center justify-center px-3`}
className={`${styles.inputBox} p-1 md:px-2 shadow-md bg-background align-middle items-center justify-center dark:bg-neutral-700 dark:border-0 dark:shadow-sm rounded-t-2xl rounded-b-none md:rounded-xl`}
>
<ChatInputArea
isLoggedIn={props.isLoggedIn}
sendMessage={(message) => setMessage(message)}
sendImage={(image) => setImage(image)}
sendDisabled={processingMessage}
chatOptionsData={props.chatOptionsData}
conversationId={props.conversationId}
@@ -101,6 +109,7 @@ export default function SharedChat() {
const [processQuerySignal, setProcessQuerySignal] = useState(false);
const [uploadedFiles, setUploadedFiles] = useState<string[]>([]);
const [paramSlug, setParamSlug] = useState<string | undefined>(undefined);
const [image64, setImage64] = useState<string>("");
const locationData = useIPLocationData();
const authenticatedData = useAuthenticatedData();
@@ -156,6 +165,7 @@ export default function SharedChat() {
completed: false,
timestamp: new Date().toISOString(),
rawQuery: queryToProcess || "",
uploadedImageData: decodeURIComponent(image64),
};
setMessages((prevMessages) => [...prevMessages, newStreamMessage]);
setProcessQuerySignal(true);
@@ -182,6 +192,7 @@ export default function SharedChat() {
if (done) {
setQueryToProcess("");
setProcessQuerySignal(false);
setImage64("");
break;
}
@@ -216,33 +227,21 @@ export default function SharedChat() {
chatAPI += `&region=${locationData.region}&country=${locationData.country}&city=${locationData.city}&timezone=${locationData.timezone}`;
}
const response = await fetch(chatAPI);
const response = await fetch(chatAPI, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: image64 ? JSON.stringify({ image: image64 }) : undefined,
});
try {
await readChatStream(response);
} catch (err) {
console.log(err);
} catch (error) {
console.error(error);
}
}
useEffect(() => {
(async () => {
if (conversationId) {
// Add a new object to the state
const newStreamMessage: StreamMessage = {
rawResponse: "",
trainOfThought: [],
context: [],
onlineContext: {},
completed: false,
timestamp: new Date().toISOString(),
rawQuery: queryToProcess || "",
};
setProcessQuerySignal(true);
setMessages((prevMessages) => [...prevMessages, newStreamMessage]);
}
})();
}, [conversationId, queryToProcess]);
if (isLoading) {
return <Loading />;
}
@@ -275,6 +274,7 @@ export default function SharedChat() {
setTitle={setTitle}
setUploadedFiles={setUploadedFiles}
isMobileWidth={isMobileWidth}
setImage64={setImage64}
/>
</Suspense>
</div>

View File

@@ -12,15 +12,11 @@ div.main {
div.inputBox {
border: 1px solid var(--border-color);
border-radius: 16px;
margin-bottom: 20px;
gap: 12px;
padding-left: 20px;
padding-right: 20px;
align-content: center;
}
input.inputBox {
border: none;
}
@@ -31,7 +27,7 @@ input.inputBox:focus {
}
div.inputBox:focus {
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
box-shadow: 0px 8px 16px 0px rgba(0, 0, 0, 0.2);
}
div.chatBodyFull {
@@ -94,7 +90,6 @@ div.agentIndicator {
padding: 10px;
}
@media (max-width: 768px) {
div.chatBody {
grid-template-columns: 0fr 1fr;
@@ -124,5 +119,4 @@ div.agentIndicator {
gap: 0;
grid-template-columns: 1fr;
}
}