Initial commit: add a dedicated page for managing the knowledge base

- One current issue in the Khoj application is that managing the files being referenced as the user's knowledge base is slightly opaque and difficult to access
- Add a migration for associating the fileobjects directly with the Entry objects, making it easier to get data via foreign key
- Add the new page that shows all indexed files in the search view, also allowing you to upload new docs directly from that page
- Support new APIs for getting / deleting files
This commit is contained in:
sabaimran
2025-01-10 16:24:50 -08:00
parent 65f1c27963
commit 454a752071
10 changed files with 788 additions and 56 deletions

View File

@@ -94,3 +94,33 @@ export function useDebounce<T>(value: T, delay: number): T {
return debouncedValue;
}
/**
 * Render an ISO timestamp as a short, human-friendly string.
 *
 * Timestamps less than 24 hours old are shown relative to now
 * ("just now", "5 minutes ago", "1 hour ago", ...). Anything older is shown
 * as a full en-US date and time including the short time zone name.
 *
 * @param isoString - Timestamp in a format accepted by the `Date` constructor.
 * @returns The formatted string, or `isoString` unchanged when it cannot be
 * formatted (for example when it does not parse to a valid date).
 */
export const formatDateTime = (isoString: string): string => {
    try {
        const date = new Date(isoString);
        const now = new Date();
        const diffInMinutes = Math.floor((now.getTime() - date.getTime()) / 60000);

        // Show relative time for recent dates
        if (diffInMinutes < 1) return "just now";
        // Singular form needs its own branch, otherwise we render "1 minutes ago"
        if (diffInMinutes === 1) return "1 minute ago";
        if (diffInMinutes < 60) return `${diffInMinutes} minutes ago`;
        if (diffInMinutes < 120) return "1 hour ago";
        if (diffInMinutes < 1440) return `${Math.floor(diffInMinutes / 60)} hours ago`;

        // For older dates, show full formatted date.
        // An unparseable input yields NaN above, skips every branch and makes
        // format() throw, which the catch below turns into the raw input.
        const formatter = new Intl.DateTimeFormat("en-US", {
            month: "long",
            day: "numeric",
            year: "numeric",
            hour: "numeric",
            minute: "2-digit",
            hour12: true,
            timeZoneName: "short",
        });
        return formatter.format(date);
    } catch (error) {
        console.error("Error formatting date:", error);
        return isoString;
    }
};

View File

@@ -17,7 +17,7 @@ import {
KhojSearchLogo,
} from "../logo/khojLogo";
import { Gear } from "@phosphor-icons/react/dist/ssr";
import { Plus } from "@phosphor-icons/react";
import { Book, Plus } from "@phosphor-icons/react";
import { useEffect, useState } from "react";
import AllConversations from "../allConversations/allConversations";
import FooterMenu from "../navMenu/navMenu";
@@ -26,6 +26,7 @@ import { useIsMobileWidth } from "@/app/common/utils";
import { UserPlusIcon } from "lucide-react";
import { useAuthenticatedData } from "@/app/common/auth";
import LoginPrompt from "../loginPrompt/loginPrompt";
import { url } from "inspector";
// Menu items.
const items = [
@@ -54,6 +55,11 @@ const items = [
url: "/settings",
icon: Gear,
},
{
title: "Knowledge Base",
url: "/knowledge",
icon: Book,
},
];
const SIDEBAR_KEYBOARD_SHORTCUT = "b";

View File

@@ -0,0 +1,93 @@
"use client";
import { useState, useEffect } from "react";
import { SidebarInset, SidebarProvider, SidebarTrigger } from "@/components/ui/sidebar";
import { AppSidebar } from "../components/appSidebar/appSidebar";
import { Separator } from "@/components/ui/separator";
import { KhojLogoType } from "../components/logo/khojLogo";
import { Card, CardHeader, CardTitle, CardContent } from "@/components/ui/card";
import { useIsMobileWidth } from "../common/utils";
import { InlineLoading } from "../components/loading/loading";
/** One indexed file as rendered by the knowledge base page. */
interface FileObject {
    /** Name of the file; the page displays the segment after the last "/". */
    file_name: string;

    /** Text content of the file; the page previews its first 100 characters. */
    raw_text: string;
}
/**
 * Knowledge base page: fetches every indexed file from `/api/content/all`
 * once on mount and renders them as a grid of cards showing the file's base
 * name and a 100 character preview of its text.
 *
 * While the request is in flight a loading indicator is shown; a failed
 * request shows "Failed to load files".
 */
export default function KnowledgeBase() {
    const [files, setFiles] = useState<FileObject[]>([]);
    const [loading, setLoading] = useState(true);
    const [error, setError] = useState<string | null>(null);
    const isMobileWidth = useIsMobileWidth();

    useEffect(() => {
        const fetchFiles = async () => {
            try {
                const response = await fetch("/api/content/all");
                if (!response.ok) throw new Error("Failed to fetch files");

                const filesList = await response.json();
                if (Array.isArray(filesList)) {
                    // An explicit comparator is required here: without one the
                    // objects are compared by their string form, which is
                    // "[object Object]" for every element, so nothing is sorted.
                    setFiles(
                        filesList.toSorted((a: FileObject, b: FileObject) =>
                            a.file_name.localeCompare(b.file_name),
                        ),
                    );
                }
            } catch (err) {
                setError("Failed to load files");
                console.error("Error fetching files:", err);
            } finally {
                setLoading(false);
            }
        };

        fetchFiles();
    }, []);

    return (
        <SidebarProvider>
            <AppSidebar conversationId={""} />
            <SidebarInset>
                <header className="flex h-16 shrink-0 items-center gap-2 border-b px-4">
                    <SidebarTrigger className="-ml-1" />
                    <Separator orientation="vertical" className="mr-2 h-4" />
                    {isMobileWidth ? (
                        <a className="p-0 no-underline" href="/">
                            <KhojLogoType className="h-auto w-16" />
                        </a>
                    ) : (
                        <h2 className="text-lg">Knowledge Base</h2>
                    )}
                </header>
                <main>
                    <div className="md:w-3/4 sm:w-full mx-auto pt-6 md:pt-8">
                        {loading && (
                            <div className="mt-4 flex items-center justify-center">
                                <InlineLoading
                                    className="mt-4"
                                    message={"Loading"}
                                    iconClassName="h-5 w-5"
                                />
                            </div>
                        )}
                        {error && <div className="text-red-500">{error}</div>}

                        <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
                            {files.map((file, index) => (
                                <Card key={index}>
                                    <CardHeader>
                                        <CardTitle className="text-sm font-medium">
                                            {file.file_name.split("/").pop()}
                                        </CardTitle>
                                    </CardHeader>
                                    <CardContent>
                                        <p className="text-sm text-muted-foreground">
                                            {file.raw_text.slice(0, 100)}...
                                        </p>
                                    </CardContent>
                                </Card>
                            ))}
                        </div>
                    </div>
                </main>
            </SidebarInset>
        </SidebarProvider>
    );
}

View File

@@ -2,6 +2,7 @@ import type { Metadata } from "next";
import "../globals.css";
import { ContentSecurityPolicy } from "../common/layoutHelper";
import { Toaster } from "@/components/ui/toaster";
export const metadata: Metadata = {
title: "Khoj AI - Search",
@@ -35,7 +36,10 @@ export default function RootLayout({
return (
<html>
<ContentSecurityPolicy />
<body>{children}</body>
<body>
{children}
<Toaster />
</body>
</html>
);
}

View File

@@ -24,16 +24,52 @@ import {
MagnifyingGlass,
NoteBlank,
NotionLogo,
Eye,
Trash,
ArrowsOutSimple,
DotsThreeVertical,
Waveform,
Plus,
} from "@phosphor-icons/react";
import { Button } from "@/components/ui/button";
import Link from "next/link";
import { getIconFromFilename } from "../common/iconUtils";
import { useIsMobileWidth } from "../common/utils";
import { formatDateTime, useIsMobileWidth } from "../common/utils";
import { SidebarInset, SidebarProvider, SidebarTrigger } from "@/components/ui/sidebar";
import { AppSidebar } from "../components/appSidebar/appSidebar";
import { Separator } from "@/components/ui/separator";
import { KhojLogoType } from "../components/logo/khojLogo";
import { InlineLoading } from "../components/loading/loading";
import {
AlertDialog,
AlertDialogContent,
AlertDialogDescription,
AlertDialogFooter,
AlertDialogHeader,
AlertDialogTitle,
AlertDialogCancel,
AlertDialogAction,
AlertDialogTrigger,
} from "@/components/ui/alert-dialog";
import {
Dialog,
DialogContent,
DialogHeader,
DialogTitle,
DialogTrigger,
} from "@/components/ui/dialog";
import { useToast } from "@/components/ui/use-toast";
import { Scroll } from "lucide-react";
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuLabel,
DropdownMenuTrigger,
} from "@/components/ui/dropdown-menu";
import { uploadDataForIndexing } from "../common/chatFunctions";
import { CommandDialog } from "@/components/ui/command";
import { Progress } from "@/components/ui/progress";
interface AdditionalData {
file: string;
source: string;
@@ -49,6 +85,12 @@ interface SearchResult {
"corpus-id": string;
}
/** One indexed file as returned by `/api/content/all` and shown on the search page. */
interface FileObject {
    /** Name of the file; cards display the segment after the last "/". */
    file_name: string;

    /** Text content of the file; cards preview its first 100 characters. */
    raw_text: string;

    /** Last-modified timestamp string, rendered through `formatDateTime`. */
    updated_at: string;
}
function getNoteTypeIcon(source: string) {
if (source === "notion") {
return <NotionLogo className="text-muted-foreground" />;
@@ -92,7 +134,7 @@ function Note(props: NoteResultProps) {
const fileIcon = getIconFromFilename(fileName || ".txt", "h-4 w-4 inline mr-2");
return (
<Card className="bg-secondary h-full shadow-sm rounded-lg border border-muted mb-4">
<Card className="bg-secondary h-full shadow-sm rounded-lg border border-muted mb-4 animate-fade-in-up">
<CardHeader>
<CardTitle className="inline-flex gap-2">
{getNoteTypeIcon(note.additional.source)}
@@ -139,7 +181,7 @@ function focusNote(note: SearchResult) {
const fileIcon = getIconFromFilename(fileName || ".txt", "h-4 w-4 inline mr-2");
return (
<Card className="bg-secondary h-full shadow-sm rounded-lg bg-gradient-to-b from-background to-slate-50 dark:to-gray-950 border border-muted mb-4">
<Card className="bg-secondary h-full shadow-sm rounded-lg border border-muted mb-4">
<CardHeader>
<CardTitle>{fileName}</CardTitle>
</CardHeader>
@@ -167,27 +209,147 @@ function focusNote(note: SearchResult) {
);
}
/**
 * Drop zone for adding documents to the knowledge base.
 *
 * Files can be dropped onto the area or picked through a hidden file input
 * that opens when the area is clicked. Either way the selected `FileList` is
 * handed to `uploadDataForIndexing`, which drives the `uploading`, `warning`
 * and `error` state through the setters passed to it.
 *
 * Props:
 * - `onClose`: accepted for interface compatibility; not used by this component.
 * - `setUploadedFiles`: forwarded to `uploadDataForIndexing`.
 */
const UploadFiles: React.FC<{
    onClose: () => void;
    setUploadedFiles: (files: string[]) => void;
}> = ({ setUploadedFiles }) => {
    const [isDragAndDropping, setIsDragAndDropping] = useState(false);
    const [warning, setWarning] = useState<string | null>(null);
    const [error, setError] = useState<string | null>(null);
    const [uploading, setUploading] = useState(false);
    const [progressValue, setProgressValue] = useState(0);
    const fileInputRef = useRef<HTMLInputElement>(null);

    // The real upload progress is not observable here, so while an upload is
    // running the bar creeps forward by a random 1-5 points every 800ms,
    // capped at 100. It resets to 0 as soon as the upload is over.
    useEffect(() => {
        if (!uploading) {
            setProgressValue(0);
            return;
        }

        const interval = setInterval(() => {
            setProgressValue((prev) => {
                const increment = Math.floor(Math.random() * 5) + 1;
                return Math.min(prev + increment, 100);
            });
        }, 800);
        return () => clearInterval(interval);
    }, [uploading]);

    /** Mark the zone as an active drop target; preventDefault allows the drop. */
    function handleDragOver(event: React.DragEvent<HTMLDivElement>) {
        event.preventDefault();
        setIsDragAndDropping(true);
    }

    /** Clear the drop-target highlight when the drag leaves the zone. */
    function handleDragLeave(event: React.DragEvent<HTMLDivElement>) {
        event.preventDefault();
        setIsDragAndDropping(false);
    }

    /** Upload whatever files were dropped onto the zone. */
    function handleDragAndDropFiles(event: React.DragEvent<HTMLDivElement>) {
        event.preventDefault();
        setIsDragAndDropping(false);
        if (!event.dataTransfer.files) return;
        uploadFiles(event.dataTransfer.files);
    }

    /** Open the browser's file picker through the hidden input. */
    function openFileInput() {
        fileInputRef.current?.click();
    }

    /** Upload the files chosen in the file picker. */
    function handleFileChange(event: React.ChangeEvent<HTMLInputElement>) {
        if (!event.target.files) return;
        uploadFiles(event.target.files);
    }

    /** Hand the files to the shared indexing helper along with our state setters. */
    function uploadFiles(files: FileList) {
        uploadDataForIndexing(files, setWarning, setUploading, setError, setUploadedFiles);
    }

    return (
        <div
            className={`flex flex-col h-full`}
            onDragOver={handleDragOver}
            onDragLeave={handleDragLeave}
            onDrop={handleDragAndDropFiles}
            onClick={openFileInput}
        >
            <input
                type="file"
                multiple
                ref={fileInputRef}
                style={{ display: "none" }}
                onChange={handleFileChange}
            />
            <div className="flex-none p-4">
                {uploading && (
                    <Progress
                        indicatorColor="bg-slate-500"
                        className="w-full h-2 rounded-full"
                        value={progressValue}
                    />
                )}
                {/* Surface whatever the upload helper reported instead of dropping it */}
                {warning && <div className="text-yellow-600">{warning}</div>}
                {error && <div className="text-red-500">{error}</div>}
            </div>
            <div
                className={`flex-none p-4 bg-secondary border-b ${isDragAndDropping ? "animate-pulse" : ""} rounded-lg`}
            >
                <div className="flex items-center justify-center w-full h-32 border-2 border-dashed border-gray-300 rounded-lg">
                    {isDragAndDropping ? (
                        <div className="flex items-center justify-center w-full h-full">
                            <Waveform className="h-6 w-6 mr-2" />
                            <span>Drop files to upload</span>
                        </div>
                    ) : (
                        <div className="flex items-center justify-center w-full h-full">
                            <Plus className="h-6 w-6 mr-2" />
                            <span>Drag and drop files here</span>
                        </div>
                    )}
                </div>
            </div>
        </div>
    );
};
export default function Search() {
const [searchQuery, setSearchQuery] = useState("");
const [searchResults, setSearchResults] = useState<SearchResult[] | null>(null);
const [searchResultsLoading, setSearchResultsLoading] = useState(false);
const [focusSearchResult, setFocusSearchResult] = useState<SearchResult | null>(null);
const [exampleQuery, setExampleQuery] = useState("");
const [files, setFiles] = useState<FileObject[]>([]);
const [error, setError] = useState<string | null>(null);
const [fileObjectsLoading, setFileObjectsLoading] = useState(true);
const searchTimeoutRef = useRef<NodeJS.Timeout | null>(null);
const [selectedFile, setSelectedFile] = useState<string | null>(null);
const [selectedFileFullText, setSelectedFileFullText] = useState<string | null>(null);
const [isDeleting, setIsDeleting] = useState(false);
const [uploadedFiles, setUploadedFiles] = useState<string[]>([]);
const [selectedFiles, setSelectedFiles] = useState<string[]>([]);
const [filteredFiles, setFilteredFiles] = useState<string[]>([]);
const { toast } = useToast();
const isMobileWidth = useIsMobileWidth();
useEffect(() => {
setExampleQuery(
naturalLanguageSearchQueryExamples[
Math.floor(Math.random() * naturalLanguageSearchQueryExamples.length)
],
);
}, []);
function search() {
if (searchResultsLoading || !searchQuery.trim()) return;
setSearchResultsLoading(true);
const apiUrl = `/api/search?q=${encodeURIComponent(searchQuery)}&client=web`;
fetch(apiUrl, {
method: "GET",
@@ -205,8 +367,69 @@ export default function Search() {
});
}
/**
 * Bulk-delete files through `DELETE /api/content/files`.
 *
 * Targets the current selection, or the filtered list when nothing is
 * selected; does nothing when both are empty. On success the deleted names
 * are dropped from `uploadedFiles` and the selection is cleared. Failures
 * are only logged to the console.
 */
const deleteSelected = async () => {
    const targets = selectedFiles.length > 0 ? selectedFiles : filteredFiles;
    if (targets.length === 0) return;

    try {
        const requestOptions = {
            method: "DELETE",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ files: targets }),
        };
        const response = await fetch("/api/content/files", requestOptions);
        if (!response.ok) {
            throw new Error("Failed to delete files");
        }

        // Forget the names that were just removed on the server
        setUploadedFiles((current) => current.filter((name) => !targets.includes(name)));
        // Nothing is selected any more
        setSelectedFiles([]);
    } catch (error) {
        console.error("Error deleting files:", error);
    }
};
/**
 * Load every indexed file from `/api/content/all` into `files`, ordered by
 * file name.
 *
 * A failed request sets the "Failed to load files" error message. The
 * loading flag is cleared whether the request succeeded or not.
 */
const fetchFiles = async () => {
    try {
        const response = await fetch("/api/content/all");
        if (!response.ok) throw new Error("Failed to fetch files");

        const filesList = await response.json();
        if (Array.isArray(filesList)) {
            // An explicit comparator is required here: without one the objects
            // are compared by their string form, which is "[object Object]"
            // for every element, so nothing is sorted.
            setFiles(
                filesList.toSorted((a: FileObject, b: FileObject) =>
                    a.file_name.localeCompare(b.file_name),
                ),
            );
        }
    } catch (error) {
        setError("Failed to load files");
        console.error("Error fetching files:", error);
    } finally {
        setFileObjectsLoading(false);
    }
};
/**
 * Fetch one file from `/api/content/file` and store its full text in
 * `selectedFileFullText`.
 *
 * A failed request sets the "Failed to load file" error message.
 *
 * @param fileName - Name of the file to load, as listed in `files`.
 */
const fetchSpecificFile = async (fileName: string) => {
    try {
        // File names are arbitrary text (spaces, "&", "#", "?", "+", ...), so
        // they must be escaped before being placed in the query string.
        const response = await fetch(
            `/api/content/file?file_name=${encodeURIComponent(fileName)}`,
        );
        if (!response.ok) throw new Error("Failed to fetch file");

        const file = await response.json();
        setSelectedFileFullText(file.raw_text);
    } catch (error) {
        setError("Failed to load file");
        console.error("Error fetching file:", error);
    }
};
useEffect(() => {
if (!searchQuery.trim()) {
setSearchResults(null);
return;
}
@@ -229,6 +452,48 @@ export default function Search() {
};
}, [searchQuery]);
// Load the full text of a file whenever a (non-null) file gets selected.
useEffect(() => {
if (selectedFile) {
fetchSpecificFile(selectedFile);
}
}, [selectedFile]);
// Load the list of indexed files once, when the page mounts.
useEffect(() => {
fetchFiles();
}, []);
// Reload the list of indexed files whenever uploadedFiles changes to a non-empty list.
useEffect(() => {
if (uploadedFiles.length > 0) {
fetchFiles();
}
}, [uploadedFiles]);
/**
 * Delete a single file through `DELETE /api/content/file` and refresh the
 * file list.
 *
 * Shows a toast for both outcomes and keeps `isDeleting` set for the
 * duration of the request.
 *
 * @param fileName - Name of the file to delete, as listed in `files`.
 */
const handleDelete = async (fileName: string) => {
    setIsDeleting(true);
    try {
        // File names are arbitrary text (spaces, "&", "#", "?", "+", ...), so
        // they must be escaped before being placed in the query string.
        const response = await fetch(
            `/api/content/file?filename=${encodeURIComponent(fileName)}`,
            {
                method: "DELETE",
            },
        );
        if (!response.ok) throw new Error("Failed to delete file");

        toast({
            title: "File deleted",
            description: `File ${fileName} has been deleted`,
            variant: "default",
        });

        // Refresh files list
        fetchFiles();
    } catch (error) {
        // Keep the cause available for debugging; the toast only tells the user it failed
        console.error("Error deleting file:", error);
        toast({
            title: "Error deleting file",
            description: `Failed to delete file ${fileName}`,
            variant: "destructive",
        });
    } finally {
        setIsDeleting(false);
    }
};
return (
<SidebarProvider>
<AppSidebar conversationId={""} />
@@ -251,20 +516,34 @@ export default function Search() {
<div className="flex justify-between items-center border-2 border-muted p-1 gap-1 rounded-lg">
<Input
autoFocus={true}
className="border-none pl-4"
className="border-none pl-4 focus-visible:ring-transparent focus-visible:ring-offset-transparent"
onChange={(e) => setSearchQuery(e.currentTarget.value)}
onKeyDown={(e) => e.key === "Enter" && search()}
type="search"
placeholder="Search Documents"
/>
<button
className="px-2 gap-2 inline-flex items-center rounded border-l border-gray-300 hover:text-gray-500"
<Button
className="px-2 gap-2 inline-flex rounded-none items-center border-l border-gray-300 hover:text-gray-500"
variant={"ghost"}
onClick={() => search()}
>
<MagnifyingGlass className="h-4 w-4" />
<span>Find</span>
</button>
</Button>
</div>
<UploadFiles
onClose={() => {}}
setUploadedFiles={setUploadedFiles}
/>
{searchResultsLoading && (
<div className="mt-4 flex items-center justify-center">
<InlineLoading
className="mt-4"
message={"Searching"}
iconClassName="h-5 w-5"
/>
</div>
)}
{focusSearchResult && (
<div className="mt-4">
<Button
@@ -279,6 +558,7 @@ export default function Search() {
</div>
)}
{!focusSearchResult &&
!searchResultsLoading &&
searchResults &&
searchResults.length > 0 && (
<div className="mt-4 max-w-[92vw] break-all">
@@ -297,23 +577,149 @@ export default function Search() {
</ScrollArea>
</div>
)}
{searchResults == null && (
<Card className="flex flex-col items-center justify-center border-none shadow-none">
<CardHeader className="flex flex-col items-center justify-center">
<CardDescription className="border-muted-foreground border w-fit rounded-lg mb-2 text-center text-lg p-4">
<FileMagnifyingGlass
weight="fill"
className="text-muted-foreground h-10 w-10"
{searchResults === null && (
<div className="w-full mt-4">
{fileObjectsLoading && (
<div className="mt-4 flex items-center justify-center">
<InlineLoading
className="mt-4"
message={"Loading"}
iconClassName="h-5 w-5"
/>
</CardDescription>
<CardTitle className="text-center">
Search across your documents
</CardTitle>
</CardHeader>
<CardContent className="text-muted-foreground items-center justify-center text-center flex">
<Lightbulb className="inline mr-2" /> {exampleQuery}
</CardContent>
</Card>
</div>
)}
{error && <div className="text-red-500">{error}</div>}
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
{files.map((file, index) => (
<Card
key={index}
className="animate-fade-in-up bg-secondary h-52"
>
<CardHeader className="p-2">
<CardTitle
className="flex items-center gap-2"
title={file.file_name}
>
<div className="text-sm font-medium truncate hover:text-clip hover:whitespace-normal">
{file.file_name.split("/").pop()}
</div>
<DropdownMenu>
<DropdownMenuTrigger>
<Button variant={"ghost"}>
<DotsThreeVertical className="h-4 w-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent className="flex flex-col gap-0 w-fit">
<DropdownMenuItem className="p-0">
<AlertDialog>
<AlertDialogTrigger>
<Button
variant={
"ghost"
}
className="flex items-center gap-2 p-1 text-sm"
>
<Trash className="h-4 w-4" />
<span className="text-xs">
Delete
</span>
</Button>
</AlertDialogTrigger>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>
Delete File
</AlertDialogTitle>
</AlertDialogHeader>
<AlertDialogDescription>
Are you sure you
want to delete
this file?
</AlertDialogDescription>
<AlertDialogFooter>
<AlertDialogCancel>
Cancel
</AlertDialogCancel>
<AlertDialogAction
onClick={() =>
handleDelete(
file.file_name,
)
}
>
{isDeleting
? "Deleting..."
: "Delete"}
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</DropdownMenuItem>
<DropdownMenuItem className="p-0">
<Dialog>
<DialogTrigger>
<Button
variant={
"ghost"
}
className="flex items-center gap-2 p-1 text-sm"
onClick={() => {
setSelectedFileFullText(
null,
);
setSelectedFile(
file.file_name,
);
}}
>
<ArrowsOutSimple className="h-4 w-4" />
<span className="text-xs">
View Full
Text
</span>
</Button>
</DialogTrigger>
<DialogContent>
<DialogHeader>
<DialogTitle>
{file.file_name
.split(
"/",
)
.pop()}
</DialogTitle>
</DialogHeader>
<ScrollArea className="h-[50vh]">
<p className="whitespace-pre-wrap break-words text-sm font-normal">
{
selectedFileFullText
}
</p>
</ScrollArea>
</DialogContent>
</Dialog>
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</CardTitle>
</CardHeader>
<CardContent className="p-2">
<ScrollArea className="h-24">
<p className="whitespace-pre-wrap break-words text-sm font-normal text-muted-foreground p-2 rounded-lg bg-background">
{file.raw_text.slice(0, 100)}...
</p>
</ScrollArea>
</CardContent>
<CardFooter className="flex justify-end gap-2 p-2">
<div className="text-muted-foreground text-xs">
{formatDateTime(file.updated_at)}
</div>
</CardFooter>
</Card>
))}
</div>
</div>
)}
{searchResults && searchResults.length === 0 && (
<Card className="flex flex-col items-center justify-center border-none shadow-none">

View File

@@ -0,0 +1,49 @@
from django.core.management.base import BaseCommand
from django.db.models import Exists, OuterRef
from khoj.database.models import Entry, FileObject
class Command(BaseCommand):
    """Management command that removes FileObjects no Entry refers to.

    Runs as a dry run that only reports counts unless ``--apply`` is passed.
    """

    help = "Deletes FileObjects that have no associated Entries"

    def add_arguments(self, parser):
        """Register the ``--apply`` flag that switches from dry run to deletion."""
        parser.add_argument(
            "--apply",
            action="store_true",
            help="Actually perform the deletion. Without this flag, only shows what would be deleted.",
        )

    def handle(self, *args, **options):
        """Find orphaned FileObjects and delete (or just count) them in batches."""
        apply_changes = options["apply"]

        # Find FileObjects with no related entries using subquery
        orphaned_files = FileObject.objects.annotate(
            has_entries=Exists(Entry.objects.filter(file_object=OuterRef("pk")))
        ).filter(has_entries=False)

        total_orphaned = orphaned_files.count()
        mode = "DELETE" if apply_changes else "DRY RUN"
        self.stdout.write(f"[{mode}] Found {total_orphaned} orphaned FileObjects")

        if total_orphaned == 0:
            self.stdout.write("No orphaned FileObjects to process")
            return

        # Process in batches of 1000
        batch_size = 1000
        processed = 0
        orphaned_ids = orphaned_files.order_by("pk").values_list("pk", flat=True)

        while True:
            # Deleting shrinks the result set, so apply mode always reads from
            # the start. A dry run changes nothing and has to page forward,
            # otherwise it would read the same first batch forever.
            offset = 0 if apply_changes else processed
            batch_ids = list(orphaned_ids[offset : offset + batch_size])
            if not batch_ids:
                break

            if apply_changes:
                # Django refuses delete() on a sliced queryset, so delete by
                # primary key instead. Count only FileObject rows so the
                # progress figure stays comparable to total_orphaned.
                _, deleted_per_model = FileObject.objects.filter(pk__in=batch_ids).delete()
                processed += deleted_per_model.get(FileObject._meta.label, 0)
                self.stdout.write(f"Deleted {processed}/{total_orphaned} orphaned FileObjects")
            else:
                processed += len(batch_ids)
                self.stdout.write(f"Would delete {processed}/{total_orphaned} orphaned FileObjects")

        action = "Deleted" if apply_changes else "Would delete"
        self.stdout.write(self.style.SUCCESS(f"{action} {processed} orphaned FileObjects"))

View File

@@ -0,0 +1,75 @@
# Generated by Django 5.0.10 on 2025-01-10 18:28
import django.db.models.deletion
from django.db import migrations, models
def migrate_entry_objects(apps, schema_editor):
    """Backfill ``Entry.file_object`` for entries created before the field existed.

    An entry is linked to the FileObject that has the same user and whose
    ``file_name`` equals the entry's ``file_path``. Entries without such a
    FileObject are left unlinked.
    """
    Entry = apps.get_model("database", "Entry")
    FileObject = apps.get_model("database", "FileObject")
    db_alias = schema_editor.connection.alias

    # Create lookup dictionary of all file objects. Only the key columns are
    # read so that raw_text (the full file contents) is never loaded. Ordering
    # by id makes the choice deterministic if several rows share a key: the
    # highest id wins.
    file_object_ids = {
        (user_id, file_name): file_object_id
        for file_object_id, user_id, file_name in FileObject.objects.using(db_alias)
        .order_by("id")
        .values_list("id", "user_id", "file_name")
        .iterator()
    }

    # Process entries in chunks of 1000
    chunk_size = 1000
    processed = 0
    last_seen_id = None

    while True:
        # Walk the table in primary key order and resume after the last entry
        # seen. Entries that find no match stay unlinked, so without the
        # cursor they would be fetched again on every pass.
        pending = Entry.objects.using(db_alias).filter(file_object__isnull=True)
        if last_seen_id is not None:
            pending = pending.filter(id__gt=last_seen_id)
        entries = list(pending.order_by("id").only("id", "user", "file_path")[:chunk_size])

        if not entries:
            break

        last_seen_id = entries[-1].id

        entries_to_update = []
        for entry in entries:
            file_object_id = file_object_ids.get((entry.user_id, entry.file_path))
            if file_object_id is not None:
                entry.file_object_id = file_object_id
                entries_to_update.append(entry)

        if entries_to_update:
            Entry.objects.using(db_alias).bulk_update(entries_to_update, ["file_object"], batch_size=chunk_size)

        processed += len(entries)
        print(f"Processed {processed} entries")
def reverse_migration(apps, schema_editor):
    """Reverse step for the data backfill; intentionally does nothing."""
    return None
class Migration(migrations.Migration):
    # Adds the Entry.file_object foreign key, then backfills it for existing entries.
    dependencies = [
        ("database", "0078_khojuser_email_verification_code_expiry"),
    ]

    operations = [
        # Schema step: nullable foreign key from Entry to FileObject.
        # CASCADE means deleting a FileObject also deletes the entries linked to it.
        migrations.AddField(
            model_name="entry",
            name="file_object",
            field=models.ForeignKey(
                blank=True,
                default=None,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                to="database.fileobject",
            ),
        ),
        # Data step: link existing entries to their file objects; the reverse function is a no-op.
        migrations.RunPython(migrate_entry_objects, reverse_migration),
    ]

View File

@@ -326,6 +326,7 @@ class ProcessLock(DbBaseModel):
INDEX_CONTENT = "index_content"
SCHEDULED_JOB = "scheduled_job"
SCHEDULE_LEADER = "schedule_leader"
APPLY_MIGRATIONS = "apply_migrations"
# We need to make sure that some operations are thread-safe. To do so, add locks for potentially shared operations.
# For example, we need to make sure that only one process is updating the embeddings at a time.
@@ -658,6 +659,14 @@ class ReflectiveQuestion(DbBaseModel):
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE, default=None, null=True, blank=True)
class FileObject(DbBaseModel):
    # Contains the full text of a file that has associated Entry objects
    # Name of the file; optional, at most 400 characters
    file_name = models.CharField(max_length=400, default=None, null=True, blank=True)
    # Full text of the file
    raw_text = models.TextField()
    # Optional owners; the file object is deleted together with its user or agent
    user = models.ForeignKey(KhojUser, on_delete=models.CASCADE, default=None, null=True, blank=True)
    agent = models.ForeignKey(Agent, on_delete=models.CASCADE, default=None, null=True, blank=True)
class Entry(DbBaseModel):
class EntryType(models.TextChoices):
IMAGE = "image"
@@ -689,20 +698,13 @@ class Entry(DbBaseModel):
hashed_value = models.CharField(max_length=100)
corpus_id = models.UUIDField(default=uuid.uuid4, editable=False)
search_model = models.ForeignKey(SearchModelConfig, on_delete=models.SET_NULL, default=None, null=True, blank=True)
file_object = models.ForeignKey(FileObject, on_delete=models.CASCADE, default=None, null=True, blank=True)
def save(self, *args, **kwargs):
if self.user and self.agent:
raise ValidationError("An Entry cannot be associated with both a user and an agent.")
class FileObject(DbBaseModel):
# Same as Entry but raw will be a much larger string
file_name = models.CharField(max_length=400, default=None, null=True, blank=True)
raw_text = models.TextField()
user = models.ForeignKey(KhojUser, on_delete=models.CASCADE, default=None, null=True, blank=True)
agent = models.ForeignKey(Agent, on_delete=models.CASCADE, default=None, null=True, blank=True)
class EntryDates(DbBaseModel):
date = models.DateField()
entry = models.ForeignKey(Entry, on_delete=models.CASCADE, related_name="embeddings_dates")

View File

@@ -152,8 +152,22 @@ class TextToEntries(ABC):
with timer("Generated embeddings for entries to add to database in", logger):
entries_to_process = [hash_to_current_entries[hashed_val] for hashed_val in hashes_to_process]
data_to_embed = [getattr(entry, key) for entry in entries_to_process]
modified_files = {entry.file for entry in entries_to_process}
embeddings += self.embeddings_model[model.name].embed_documents(data_to_embed)
file_to_file_object_map = {}
if file_to_text_map and modified_files:
with timer("Indexed text of modified file in", logger):
# create or update text of each updated file indexed on DB
for modified_file in modified_files:
raw_text = file_to_text_map[modified_file]
file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
if file_object:
FileObjectAdapters.update_raw_text(file_object, raw_text)
else:
file_object = FileObjectAdapters.create_file_object(user, modified_file, raw_text)
file_to_file_object_map[modified_file] = file_object
added_entries: list[DbEntry] = []
with timer("Added entries to database in", logger):
num_items = len(hashes_to_process)
@@ -165,6 +179,7 @@ class TextToEntries(ABC):
batch_embeddings_to_create: List[DbEntry] = []
for entry_hash, new_entry in entry_batch:
entry = hash_to_current_entries[entry_hash]
file_object = file_to_file_object_map.get(entry.file, None)
batch_embeddings_to_create.append(
DbEntry(
user=user,
@@ -178,6 +193,7 @@ class TextToEntries(ABC):
hashed_value=entry_hash,
corpus_id=entry.corpus_id,
search_model=model,
file_object=file_object,
)
)
try:
@@ -190,19 +206,6 @@ class TextToEntries(ABC):
logger.error(f"Error adding entries to database:\n{batch_indexing_error}\n---\n{e}", exc_info=True)
logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
if file_to_text_map:
with timer("Indexed text of modified file in", logger):
# get the set of modified files from added_entries
modified_files = {entry.file_path for entry in added_entries}
# create or update text of each updated file indexed on DB
for modified_file in modified_files:
raw_text = file_to_text_map[modified_file]
file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
if file_object:
FileObjectAdapters.update_raw_text(file_object, raw_text)
else:
FileObjectAdapters.create_file_object(user, modified_file, raw_text)
new_dates = []
with timer("Indexed dates from added entries in", logger):
for added_entry in added_entries:

View File

@@ -22,6 +22,7 @@ from starlette.authentication import requires
from khoj.database import adapters
from khoj.database.adapters import (
EntryAdapters,
FileObjectAdapters,
get_user_github_config,
get_user_notion_config,
)
@@ -270,6 +271,8 @@ async def delete_content_files(
await EntryAdapters.adelete_entry_by_file(user, filename)
await FileObjectAdapters.adelete_file_object_by_name(user, filename)
return {"status": "ok"}
@@ -294,6 +297,8 @@ async def delete_content_file(
)
deleted_count = await EntryAdapters.adelete_entries_by_filenames(user, files.files)
for file in files.files:
await FileObjectAdapters.adelete_file_object_by_name(user, file)
return {"status": "ok", "deleted_count": deleted_count}
@@ -325,6 +330,65 @@ def get_content_types(request: Request, client: Optional[str] = None):
return list(configured_content_types & all_content_types)
@api_content.get("/all", response_model=List[Dict[str, str]])
@requires(["authenticated"])
async def get_all_content(request: Request, client: Optional[str] = None, truncated: Optional[bool] = True):
    """List the requesting user's file objects as JSON.

    Each item carries ``file_name``, ``raw_text`` and ``updated_at`` (as a
    string). When ``truncated`` is true, the default, ``raw_text`` is cut to
    its first 1000 characters; otherwise the full text is returned.
    """
    user = request.user.object

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="get_all_filenames",
        client=client,
    )

    # Upper bound on the text returned per file when truncation is requested
    preview_length = 1000

    file_objects = await FileObjectAdapters.aget_all_file_objects(user)
    files_data = [
        {
            "file_name": file_object.file_name,
            "raw_text": file_object.raw_text[:preview_length] if truncated else file_object.raw_text,
            "updated_at": str(file_object.updated_at),
        }
        for file_object in file_objects
    ]

    return Response(content=json.dumps(files_data), media_type="application/json", status_code=200)
@api_content.get("/file", response_model=Dict[str, str])
@requires(["authenticated"])
async def get_file_object(
    request: Request,
    file_name: str,
    client: Optional[str] = None,
):
    """Return one of the requesting user's file objects, looked up by file name.

    Responds with the ``id``, ``file_name`` and full ``raw_text`` of the first
    match, or with a 404 JSON error when no file object has that name.
    """
    user = request.user.object

    # Check for matches before indexing: taking [0] of an empty result raises
    # instead of reaching the 404 response below.
    file_objects = await FileObjectAdapters.aget_file_objects_by_name(user, file_name)
    file_object = file_objects[0] if file_objects else None
    if not file_object:
        return Response(
            content=json.dumps({"error": "File not found"}),
            media_type="application/json",
            status_code=404,
        )

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="get_file",
        client=client,
    )

    return Response(
        content=json.dumps(
            {"id": file_object.id, "file_name": file_object.file_name, "raw_text": file_object.raw_text}
        ),
        media_type="application/json",
        status_code=200,
    )
@api_content.get("/{content_source}", response_model=List[str])
@requires(["authenticated"])
async def get_content_source(