mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-09 05:39:12 +00:00
[Multi-User Part 7]: Improve Sign-In UX & Rename DB Models for Readability (#528)
### ✨ New
- Create profile pic drop-down menu in navigation pane
  Put settings page, logout action under drop-down menu

### ⚙️ Fix
- Add Key icon for API keys table on Web Client's settings page

### 🧪 Improve
- Rename `TextEmbeddings` to `TextEntries` for improved readability
- Rename `Db.Models` `Embeddings`, `EmbeddingsAdapter` to `Entry`, `EntryAdapter`
- Show truncated API key for identification & restrict table width for config page responsiveness
This commit is contained in:
@@ -27,7 +27,7 @@ from database.models import (
|
|||||||
KhojApiUser,
|
KhojApiUser,
|
||||||
NotionConfig,
|
NotionConfig,
|
||||||
GithubConfig,
|
GithubConfig,
|
||||||
Embeddings,
|
Entry,
|
||||||
GithubRepoConfig,
|
GithubRepoConfig,
|
||||||
Conversation,
|
Conversation,
|
||||||
ConversationProcessorConfig,
|
ConversationProcessorConfig,
|
||||||
@@ -286,54 +286,54 @@ class ConversationAdapters:
|
|||||||
return await OpenAIProcessorConversationConfig.objects.filter(user=user).afirst()
|
return await OpenAIProcessorConversationConfig.objects.filter(user=user).afirst()
|
||||||
|
|
||||||
|
|
||||||
class EmbeddingsAdapters:
|
class EntryAdapters:
|
||||||
word_filer = WordFilter()
|
word_filer = WordFilter()
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
date_filter = DateFilter()
|
date_filter = DateFilter()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def does_embedding_exist(user: KhojUser, hashed_value: str) -> bool:
|
def does_entry_exist(user: KhojUser, hashed_value: str) -> bool:
|
||||||
return Embeddings.objects.filter(user=user, hashed_value=hashed_value).exists()
|
return Entry.objects.filter(user=user, hashed_value=hashed_value).exists()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_embedding_by_file(user: KhojUser, file_path: str):
|
def delete_entry_by_file(user: KhojUser, file_path: str):
|
||||||
deleted_count, _ = Embeddings.objects.filter(user=user, file_path=file_path).delete()
|
deleted_count, _ = Entry.objects.filter(user=user, file_path=file_path).delete()
|
||||||
return deleted_count
|
return deleted_count
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_all_embeddings(user: KhojUser, file_type: str):
|
def delete_all_entries(user: KhojUser, file_type: str):
|
||||||
deleted_count, _ = Embeddings.objects.filter(user=user, file_type=file_type).delete()
|
deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete()
|
||||||
return deleted_count
|
return deleted_count
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
|
def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
|
||||||
return Embeddings.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
|
return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_embedding_by_hash(user: KhojUser, hashed_values: List[str]):
|
def delete_entry_by_hash(user: KhojUser, hashed_values: List[str]):
|
||||||
Embeddings.objects.filter(user=user, hashed_value__in=hashed_values).delete()
|
Entry.objects.filter(user=user, hashed_value__in=hashed_values).delete()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_embeddings_by_date_filter(embeddings: BaseManager[Embeddings], start_date: date, end_date: date):
|
def get_entries_by_date_filter(entry: BaseManager[Entry], start_date: date, end_date: date):
|
||||||
return embeddings.filter(
|
return entry.filter(
|
||||||
embeddingsdates__date__gte=start_date,
|
entrydates__date__gte=start_date,
|
||||||
embeddingsdates__date__lte=end_date,
|
entrydates__date__lte=end_date,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
async def user_has_embeddings(user: KhojUser):
|
async def user_has_entries(user: KhojUser):
|
||||||
return await Embeddings.objects.filter(user=user).aexists()
|
return await Entry.objects.filter(user=user).aexists()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
|
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
|
||||||
q_filter_terms = Q()
|
q_filter_terms = Q()
|
||||||
|
|
||||||
explicit_word_terms = EmbeddingsAdapters.word_filer.get_filter_terms(query)
|
explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query)
|
||||||
file_filters = EmbeddingsAdapters.file_filter.get_filter_terms(query)
|
file_filters = EntryAdapters.file_filter.get_filter_terms(query)
|
||||||
date_filters = EmbeddingsAdapters.date_filter.get_query_date_range(query)
|
date_filters = EntryAdapters.date_filter.get_query_date_range(query)
|
||||||
|
|
||||||
if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
|
if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
|
||||||
return Embeddings.objects.filter(user=user)
|
return Entry.objects.filter(user=user)
|
||||||
|
|
||||||
for term in explicit_word_terms:
|
for term in explicit_word_terms:
|
||||||
if term.startswith("+"):
|
if term.startswith("+"):
|
||||||
@@ -354,32 +354,32 @@ class EmbeddingsAdapters:
|
|||||||
if min_date is not None:
|
if min_date is not None:
|
||||||
# Convert the min_date timestamp to yyyy-mm-dd format
|
# Convert the min_date timestamp to yyyy-mm-dd format
|
||||||
formatted_min_date = date.fromtimestamp(min_date).strftime("%Y-%m-%d")
|
formatted_min_date = date.fromtimestamp(min_date).strftime("%Y-%m-%d")
|
||||||
q_filter_terms &= Q(embeddings_dates__date__gte=formatted_min_date)
|
q_filter_terms &= Q(entry_dates__date__gte=formatted_min_date)
|
||||||
if max_date is not None:
|
if max_date is not None:
|
||||||
# Convert the max_date timestamp to yyyy-mm-dd format
|
# Convert the max_date timestamp to yyyy-mm-dd format
|
||||||
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
|
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
|
||||||
q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date)
|
q_filter_terms &= Q(entry_dates__date__lte=formatted_max_date)
|
||||||
|
|
||||||
relevant_embeddings = Embeddings.objects.filter(user=user).filter(
|
relevant_entries = Entry.objects.filter(user=user).filter(
|
||||||
q_filter_terms,
|
q_filter_terms,
|
||||||
)
|
)
|
||||||
if file_type_filter:
|
if file_type_filter:
|
||||||
relevant_embeddings = relevant_embeddings.filter(file_type=file_type_filter)
|
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
||||||
return relevant_embeddings
|
return relevant_entries
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def search_with_embeddings(
|
def search_with_embeddings(
|
||||||
user: KhojUser, embeddings: Tensor, max_results: int = 10, file_type_filter: str = None, raw_query: str = None
|
user: KhojUser, embeddings: Tensor, max_results: int = 10, file_type_filter: str = None, raw_query: str = None
|
||||||
):
|
):
|
||||||
relevant_embeddings = EmbeddingsAdapters.apply_filters(user, raw_query, file_type_filter)
|
relevant_entries = EntryAdapters.apply_filters(user, raw_query, file_type_filter)
|
||||||
relevant_embeddings = relevant_embeddings.filter(user=user).annotate(
|
relevant_entries = relevant_entries.filter(user=user).annotate(
|
||||||
distance=CosineDistance("embeddings", embeddings)
|
distance=CosineDistance("embeddings", embeddings)
|
||||||
)
|
)
|
||||||
if file_type_filter:
|
if file_type_filter:
|
||||||
relevant_embeddings = relevant_embeddings.filter(file_type=file_type_filter)
|
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
||||||
relevant_embeddings = relevant_embeddings.order_by("distance")
|
relevant_entries = relevant_entries.order_by("distance")
|
||||||
return relevant_embeddings[:max_results]
|
return relevant_entries[:max_results]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_unique_file_types(user: KhojUser):
|
def get_unique_file_types(user: KhojUser):
|
||||||
return Embeddings.objects.filter(user=user).values_list("file_type", flat=True).distinct()
|
return Entry.objects.filter(user=user).values_list("file_type", flat=True).distinct()
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Generated by Django 4.2.5 on 2023-10-26 23:52
|
||||||
|
|
||||||
|
from django.db import migrations
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
dependencies = [
|
||||||
|
("database", "0009_khojapiuser"),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.RenameModel(
|
||||||
|
old_name="Embeddings",
|
||||||
|
new_name="Entry",
|
||||||
|
),
|
||||||
|
migrations.RenameModel(
|
||||||
|
old_name="EmbeddingsDates",
|
||||||
|
new_name="EntryDates",
|
||||||
|
),
|
||||||
|
migrations.RenameField(
|
||||||
|
model_name="entrydates",
|
||||||
|
old_name="embeddings",
|
||||||
|
new_name="entry",
|
||||||
|
),
|
||||||
|
migrations.RenameIndex(
|
||||||
|
model_name="entrydates",
|
||||||
|
new_name="database_en_date_8d823c_idx",
|
||||||
|
old_name="database_em_date_a1ba47_idx",
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -114,8 +114,8 @@ class Conversation(BaseModel):
|
|||||||
conversation_log = models.JSONField(default=dict)
|
conversation_log = models.JSONField(default=dict)
|
||||||
|
|
||||||
|
|
||||||
class Embeddings(BaseModel):
|
class Entry(BaseModel):
|
||||||
class EmbeddingsType(models.TextChoices):
|
class EntryType(models.TextChoices):
|
||||||
IMAGE = "image"
|
IMAGE = "image"
|
||||||
PDF = "pdf"
|
PDF = "pdf"
|
||||||
PLAINTEXT = "plaintext"
|
PLAINTEXT = "plaintext"
|
||||||
@@ -130,7 +130,7 @@ class Embeddings(BaseModel):
|
|||||||
raw = models.TextField()
|
raw = models.TextField()
|
||||||
compiled = models.TextField()
|
compiled = models.TextField()
|
||||||
heading = models.CharField(max_length=1000, default=None, null=True, blank=True)
|
heading = models.CharField(max_length=1000, default=None, null=True, blank=True)
|
||||||
file_type = models.CharField(max_length=30, choices=EmbeddingsType.choices, default=EmbeddingsType.PLAINTEXT)
|
file_type = models.CharField(max_length=30, choices=EntryType.choices, default=EntryType.PLAINTEXT)
|
||||||
file_path = models.CharField(max_length=400, default=None, null=True, blank=True)
|
file_path = models.CharField(max_length=400, default=None, null=True, blank=True)
|
||||||
file_name = models.CharField(max_length=400, default=None, null=True, blank=True)
|
file_name = models.CharField(max_length=400, default=None, null=True, blank=True)
|
||||||
url = models.URLField(max_length=400, default=None, null=True, blank=True)
|
url = models.URLField(max_length=400, default=None, null=True, blank=True)
|
||||||
@@ -138,9 +138,9 @@ class Embeddings(BaseModel):
|
|||||||
corpus_id = models.UUIDField(default=uuid.uuid4, editable=False)
|
corpus_id = models.UUIDField(default=uuid.uuid4, editable=False)
|
||||||
|
|
||||||
|
|
||||||
class EmbeddingsDates(BaseModel):
|
class EntryDates(BaseModel):
|
||||||
date = models.DateField()
|
date = models.DateField()
|
||||||
embeddings = models.ForeignKey(Embeddings, on_delete=models.CASCADE, related_name="embeddings_dates")
|
entry = models.ForeignKey(Entry, on_delete=models.CASCADE, related_name="embeddings_dates")
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
indexes = [
|
indexes = [
|
||||||
|
|||||||
4
src/khoj/interface/web/assets/icons/key.svg
Normal file
4
src/khoj/interface/web/assets/icons/key.svg
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<svg width="800px" height="800px" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||||
|
<path fill-rule="evenodd" clip-rule="evenodd" d="M22 8.29344C22 11.7692 19.1708 14.5869 15.6807 14.5869C15.0439 14.5869 13.5939 14.4405 12.8885 13.8551L12.0067 14.7333C11.4883 15.2496 11.6283 15.4016 11.8589 15.652C11.9551 15.7565 12.0672 15.8781 12.1537 16.0505C12.1537 16.0505 12.8885 17.075 12.1537 18.0995C11.7128 18.6849 10.4783 19.5045 9.06754 18.0995L8.77362 18.3922C8.77362 18.3922 9.65538 19.4167 8.92058 20.4412C8.4797 21.0267 7.30403 21.6121 6.27531 20.5876L5.2466 21.6121C4.54119 22.3146 3.67905 21.9048 3.33616 21.6121L2.45441 20.7339C1.63143 19.9143 2.1115 19.0264 2.45441 18.6849L10.0963 11.0743C10.0963 11.0743 9.3615 9.90338 9.3615 8.29344C9.3615 4.81767 12.1907 2 15.6807 2C19.1708 2 22 4.81767 22 8.29344ZM15.681 10.4889C16.8984 10.4889 17.8853 9.50601 17.8853 8.29353C17.8853 7.08105 16.8984 6.09814 15.681 6.09814C14.4635 6.09814 13.4766 7.08105 13.4766 8.29353C13.4766 9.50601 14.4635 10.4889 15.681 10.4889Z" fill="#1C274C"/>
|
||||||
|
</svg>
|
||||||
|
After: Width: 800 | Height: 800 | Size: 1.1 KiB
@@ -6,6 +6,8 @@
|
|||||||
--primary-hover: #ffa000;
|
--primary-hover: #ffa000;
|
||||||
--primary-focus: rgba(255, 179, 0, 0.125);
|
--primary-focus: rgba(255, 179, 0, 0.125);
|
||||||
--primary-inverse: rgba(0, 0, 0, 0.75);
|
--primary-inverse: rgba(0, 0, 0, 0.75);
|
||||||
|
--background-color: #fff;
|
||||||
|
--main-text-color: #475569;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Amber Dark scheme (Auto) */
|
/* Amber Dark scheme (Auto) */
|
||||||
@@ -16,6 +18,8 @@
|
|||||||
--primary-hover: #ffc107;
|
--primary-hover: #ffc107;
|
||||||
--primary-focus: rgba(255, 179, 0, 0.25);
|
--primary-focus: rgba(255, 179, 0, 0.25);
|
||||||
--primary-inverse: rgba(0, 0, 0, 0.75);
|
--primary-inverse: rgba(0, 0, 0, 0.75);
|
||||||
|
--background-color: #fff;
|
||||||
|
--main-text-color: #475569;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Amber Dark scheme (Forced) */
|
/* Amber Dark scheme (Forced) */
|
||||||
@@ -25,6 +29,8 @@
|
|||||||
--primary-hover: #ffc107;
|
--primary-hover: #ffc107;
|
||||||
--primary-focus: rgba(255, 179, 0, 0.25);
|
--primary-focus: rgba(255, 179, 0, 0.25);
|
||||||
--primary-inverse: rgba(0, 0, 0, 0.75);
|
--primary-inverse: rgba(0, 0, 0, 0.75);
|
||||||
|
--background-color: #fff;
|
||||||
|
--main-text-color: #475569;
|
||||||
}
|
}
|
||||||
/* Amber (Common styles) */
|
/* Amber (Common styles) */
|
||||||
:root {
|
:root {
|
||||||
@@ -37,7 +43,8 @@
|
|||||||
.khoj-configure {
|
.khoj-configure {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
padding: 0 24px;
|
font-family: roboto, karma, segoe ui, sans-serif;
|
||||||
|
font-weight: 300;
|
||||||
}
|
}
|
||||||
.khoj-header {
|
.khoj-header {
|
||||||
display: grid;
|
display: grid;
|
||||||
@@ -100,7 +107,84 @@ p#khoj-banner {
|
|||||||
display: inline;
|
display: inline;
|
||||||
}
|
}
|
||||||
|
|
||||||
@media only screen and (max-width: 600px) {
|
/* Dropdown in navigation menu*/
|
||||||
|
#khoj-nav-menu-container {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
.khoj-nav-dropdown-content {
|
||||||
|
display: block;
|
||||||
|
grid-auto-flow: row;
|
||||||
|
position: absolute;
|
||||||
|
background-color: var(--background-color);
|
||||||
|
min-width: 160px;
|
||||||
|
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
|
||||||
|
right: 15vw;
|
||||||
|
top: 64px;
|
||||||
|
z-index: 1;
|
||||||
|
opacity: 0;
|
||||||
|
transition: opacity 0.1s ease-in-out;
|
||||||
|
pointer-events: none;
|
||||||
|
text-align: left;
|
||||||
|
}
|
||||||
|
.khoj-nav-dropdown-content.show {
|
||||||
|
opacity: 1;
|
||||||
|
pointer-events: auto;
|
||||||
|
}
|
||||||
|
.khoj-nav-dropdown-content a {
|
||||||
|
color: black;
|
||||||
|
padding: 12px 16px;
|
||||||
|
text-decoration: none;
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
.khoj-nav-dropdown-content a:hover {
|
||||||
|
background-color: var(--primary-hover);
|
||||||
|
}
|
||||||
|
.khoj-nav-username {
|
||||||
|
padding: 12px 16px;
|
||||||
|
text-decoration: none;
|
||||||
|
display: block;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
.circle {
|
||||||
|
border-radius: 50%;
|
||||||
|
border: 2px solid var(--primary-inverse);
|
||||||
|
width: 40px;
|
||||||
|
height: 40px;
|
||||||
|
vertical-align: text-top;
|
||||||
|
padding: 3px;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
.circle:hover {
|
||||||
|
background-color: var(--primary-hover);
|
||||||
|
}
|
||||||
|
.user-initial {
|
||||||
|
background-color: white;
|
||||||
|
color: black;
|
||||||
|
display: grid;
|
||||||
|
justify-content: center;
|
||||||
|
align-items: center;
|
||||||
|
font-size: 20px;
|
||||||
|
box-sizing: unset;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media screen and (max-width: 700px) {
|
||||||
|
.khoj-nav-dropdown-content {
|
||||||
|
display: block;
|
||||||
|
grid-auto-flow: row;
|
||||||
|
position: absolute;
|
||||||
|
background-color: var(--background-color);
|
||||||
|
min-width: 160px;
|
||||||
|
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
|
||||||
|
right: 10px;
|
||||||
|
z-index: 1;
|
||||||
|
opacity: 0;
|
||||||
|
transition: opacity 0.1s ease-in-out;
|
||||||
|
pointer-events: none;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@media only screen and (max-width: 700px) {
|
||||||
div.khoj-header {
|
div.khoj-header {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-auto-flow: column;
|
grid-auto-flow: column;
|
||||||
|
|||||||
15
src/khoj/interface/web/assets/khoj.js
Normal file
15
src/khoj/interface/web/assets/khoj.js
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
// Toggle the navigation menu
|
||||||
|
function toggleMenu() {
|
||||||
|
var menu = document.getElementById("khoj-nav-menu");
|
||||||
|
menu.classList.toggle("show");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close the dropdown menu if the user clicks outside of it
|
||||||
|
document.addEventListener('click', function(event) {
|
||||||
|
let menu = document.getElementById("khoj-nav-menu");
|
||||||
|
let menuContainer = document.getElementById("khoj-nav-menu-container");
|
||||||
|
let isClickOnMenu = menuContainer.contains(event.target) || menuContainer === event.target;
|
||||||
|
if (isClickOnMenu === false && menu.classList.contains("show")) {
|
||||||
|
menu.classList.remove("show");
|
||||||
|
}
|
||||||
|
});
|
||||||
@@ -8,19 +8,15 @@
|
|||||||
<link rel="stylesheet" href="/static/assets/pico.min.css">
|
<link rel="stylesheet" href="/static/assets/pico.min.css">
|
||||||
<link rel="stylesheet" href="/static/assets/khoj.css">
|
<link rel="stylesheet" href="/static/assets/khoj.css">
|
||||||
</head>
|
</head>
|
||||||
|
<script type="text/javascript" src="/static/assets/khoj.js"></script>
|
||||||
<body class="khoj-configure">
|
<body class="khoj-configure">
|
||||||
<div class="khoj-header-wrapper">
|
<div class="khoj-header-wrapper">
|
||||||
<div class="filler"></div>
|
<div class="filler"></div>
|
||||||
<div class="khoj-header">
|
|
||||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
<!--Add Header Logo and Nav Pane-->
|
||||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
{% import 'utils.html' as utils %}
|
||||||
</a>
|
{{ utils.heading_pane(user_photo, username) }}
|
||||||
<nav class="khoj-nav">
|
|
||||||
<a class="khoj-nav" href="/chat">Chat</a>
|
|
||||||
<a class="khoj-nav" href="/">Search</a>
|
|
||||||
<a class="khoj-nav khoj-nav-selected" href="/config">Settings</a>
|
|
||||||
</nav>
|
|
||||||
</div>
|
|
||||||
<div class="filler"></div>
|
<div class="filler"></div>
|
||||||
</div>
|
</div>
|
||||||
<div class=”content”>
|
<div class=”content”>
|
||||||
@@ -38,10 +34,15 @@
|
|||||||
img.khoj-logo {
|
img.khoj-logo {
|
||||||
max-width: none!important;
|
max-width: none!important;
|
||||||
}
|
}
|
||||||
div.khoj-header-wrapper{
|
div.khoj-header-wrapper {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
||||||
}
|
}
|
||||||
|
img.circle {
|
||||||
|
width: 49px;
|
||||||
|
height: 49px;
|
||||||
|
}
|
||||||
|
|
||||||
.page {
|
.page {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-auto-flow: row;
|
grid-auto-flow: row;
|
||||||
@@ -233,12 +234,12 @@
|
|||||||
height: 32px;
|
height: 32px;
|
||||||
}
|
}
|
||||||
|
|
||||||
@media screen and (max-width: 600px) {
|
@media screen and (max-width: 700px) {
|
||||||
.section-cards {
|
.section-cards {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@media only screen and (max-width: 600px) {
|
@media only screen and (max-width: 700px) {
|
||||||
body {
|
body {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
@@ -264,10 +265,9 @@
|
|||||||
width: 320px;
|
width: 320px;
|
||||||
}
|
}
|
||||||
|
|
||||||
div.khoj-header-wrapper{
|
div.khoj-header-wrapper {
|
||||||
grid-template-columns: auto;
|
grid-template-columns: auto;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
<link rel="manifest" href="/static/khoj_chat.webmanifest">
|
<link rel="manifest" href="/static/khoj_chat.webmanifest">
|
||||||
<link rel="stylesheet" href="/static/assets/khoj.css">
|
<link rel="stylesheet" href="/static/assets/khoj.css">
|
||||||
</head>
|
</head>
|
||||||
|
<script type="text/javascript" src="/static/assets/khoj.js"></script>
|
||||||
<script>
|
<script>
|
||||||
let chatOptions = [];
|
let chatOptions = [];
|
||||||
function copyProgrammaticOutput(event) {
|
function copyProgrammaticOutput(event) {
|
||||||
@@ -269,25 +270,10 @@
|
|||||||
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!--Add Header Logo and Nav Pane-->
|
<!--Add Header Logo and Nav Pane-->
|
||||||
<div class="khoj-header">
|
{% import 'utils.html' as utils %}
|
||||||
{% if demo %}
|
{{ utils.heading_pane(user_photo, username) }}
|
||||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
|
||||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
|
||||||
</a>
|
|
||||||
{% else %}
|
|
||||||
<a class="khoj-logo" href="/">
|
|
||||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
|
||||||
</a>
|
|
||||||
{% endif %}
|
|
||||||
<nav class="khoj-nav">
|
|
||||||
<a class="khoj-nav khoj-nav-selected" href="/chat">Chat</a>
|
|
||||||
<a class="khoj-nav" href="/">Search</a>
|
|
||||||
{% if not demo %}
|
|
||||||
<a class="khoj-nav" href="/config">Settings</a>
|
|
||||||
{% endif %}
|
|
||||||
</nav>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Chat Body -->
|
<!-- Chat Body -->
|
||||||
<div id="chat-body"></div>
|
<div id="chat-body"></div>
|
||||||
@@ -309,8 +295,8 @@
|
|||||||
}
|
}
|
||||||
body {
|
body {
|
||||||
display: grid;
|
display: grid;
|
||||||
background: #fff;
|
background: var(--background-color);
|
||||||
color: #475569;
|
color: var(--main-text-color);
|
||||||
text-align: center;
|
text-align: center;
|
||||||
font-family: roboto, karma, segoe ui, sans-serif;
|
font-family: roboto, karma, segoe ui, sans-serif;
|
||||||
font-size: 20px;
|
font-size: 20px;
|
||||||
@@ -332,7 +318,7 @@
|
|||||||
content: attr(data-meta);
|
content: attr(data-meta);
|
||||||
display: block;
|
display: block;
|
||||||
font-size: x-small;
|
font-size: x-small;
|
||||||
color: #475569;
|
color: var(--main-text-color);
|
||||||
margin: -8px 4px 0 -5px;
|
margin: -8px 4px 0 -5px;
|
||||||
}
|
}
|
||||||
/* move message by khoj to left */
|
/* move message by khoj to left */
|
||||||
@@ -402,7 +388,7 @@
|
|||||||
top: 91%;
|
top: 91%;
|
||||||
right: -2px;
|
right: -2px;
|
||||||
border: 10px solid transparent;
|
border: 10px solid transparent;
|
||||||
border-left-color: #475569;
|
border-left-color: var(--main-text-color);
|
||||||
border-right: 0;
|
border-right: 0;
|
||||||
margin-top: -10px;
|
margin-top: -10px;
|
||||||
transform: rotate(-60deg)
|
transform: rotate(-60deg)
|
||||||
@@ -418,7 +404,7 @@
|
|||||||
#chat-footer > * {
|
#chat-footer > * {
|
||||||
padding: 15px;
|
padding: 15px;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
border: 1px solid #475569;
|
border: 1px solid var(--main-text-color);
|
||||||
background: #f9fafc
|
background: #f9fafc
|
||||||
}
|
}
|
||||||
.option:hover {
|
.option:hover {
|
||||||
@@ -451,9 +437,9 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
a.inline-chat-link {
|
a.inline-chat-link {
|
||||||
color: #475569;
|
color: var(--main-text-color);
|
||||||
text-decoration: none;
|
text-decoration: none;
|
||||||
border-bottom: 1px dotted #475569;
|
border-bottom: 1px dotted var(--main-text-color);
|
||||||
}
|
}
|
||||||
|
|
||||||
@media (pointer: coarse), (hover: none) {
|
@media (pointer: coarse), (hover: none) {
|
||||||
@@ -479,7 +465,7 @@
|
|||||||
padding: 2px 4px;
|
padding: 2px 4px;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@media only screen and (max-width: 600px) {
|
@media only screen and (max-width: 700px) {
|
||||||
body {
|
body {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
grid-template-rows: auto auto minmax(80px, 100%) auto;
|
grid-template-rows: auto auto minmax(80px, 100%) auto;
|
||||||
@@ -499,7 +485,7 @@
|
|||||||
padding: 0;
|
padding: 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@media only screen and (min-width: 600px) {
|
@media only screen and (min-width: 700px) {
|
||||||
body {
|
body {
|
||||||
grid-template-columns: auto min(70vw, 100%) auto;
|
grid-template-columns: auto min(70vw, 100%) auto;
|
||||||
grid-template-rows: auto auto minmax(80px, 100%) auto;
|
grid-template-rows: auto auto minmax(80px, 100%) auto;
|
||||||
@@ -542,7 +528,7 @@
|
|||||||
input#khoj-banner-email {
|
input#khoj-banner-email {
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
border: 1px solid #475569;
|
border: 1px solid var(--main-text-color);
|
||||||
background: #f9fafc;
|
background: #f9fafc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,11 +3,6 @@
|
|||||||
|
|
||||||
<div class="page">
|
<div class="page">
|
||||||
<div class="section">
|
<div class="section">
|
||||||
{% if anonymous_mode == False %}
|
|
||||||
<div>
|
|
||||||
Logged in as {{ username }}
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
<h2 class="section-title">Plugins</h2>
|
<h2 class="section-title">Plugins</h2>
|
||||||
<div class="section-cards">
|
<div class="section-cards">
|
||||||
<div class="card">
|
<div class="card">
|
||||||
@@ -328,11 +323,6 @@
|
|||||||
<div class="finalize-buttons">
|
<div class="finalize-buttons">
|
||||||
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
<button id="reinitialize" type="submit" title="Regenerate index from scratch">🔄 Reinitialize</button>
|
||||||
</div>
|
</div>
|
||||||
{% if anonymous_mode == False %}
|
|
||||||
<div class="finalize-buttons">
|
|
||||||
<button id="logout" class="logout" onclick="window.location.href='/auth/logout'">Logout</button>
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -541,16 +531,7 @@
|
|||||||
})
|
})
|
||||||
.then(response => response.json())
|
.then(response => response.json())
|
||||||
.then(tokenObj => {
|
.then(tokenObj => {
|
||||||
apiKeyList.innerHTML += `
|
apiKeyList.innerHTML += generateTokenRow(tokenObj);
|
||||||
<tr id="api-key-item-${tokenObj.token}">
|
|
||||||
<td><b>${tokenObj.name}</b></td>
|
|
||||||
<td id="api-key-${tokenObj.token}">${tokenObj.token}</td>
|
|
||||||
<td>
|
|
||||||
<img id="api-key-copy-button-${tokenObj.token}" onclick="copyAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key">
|
|
||||||
<img id="api-key-delete-button-${tokenObj.token}" onclick="deleteAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key">
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
`;
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -561,7 +542,7 @@
|
|||||||
const copyApiKeyButton = document.getElementById(`api-key-${token}`);
|
const copyApiKeyButton = document.getElementById(`api-key-${token}`);
|
||||||
original_html = copyApiKeyButton.innerHTML
|
original_html = copyApiKeyButton.innerHTML
|
||||||
setTimeout(function() {
|
setTimeout(function() {
|
||||||
copyApiKeyButton.innerHTML = "✅ Copied to your clipboard!";
|
copyApiKeyButton.innerHTML = "✅ Copied!";
|
||||||
setTimeout(function() {
|
setTimeout(function() {
|
||||||
copyApiKeyButton.innerHTML = original_html;
|
copyApiKeyButton.innerHTML = original_html;
|
||||||
}, 1000);
|
}, 1000);
|
||||||
@@ -581,23 +562,30 @@
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function generateTokenRow(tokenObj) {
|
||||||
|
let token = tokenObj.token;
|
||||||
|
let tokenName = tokenObj.name;
|
||||||
|
let truncatedToken = token.slice(0, 4) + "..." + token.slice(-4);
|
||||||
|
let tokenId = `${tokenName}-${truncatedToken}`;
|
||||||
|
return `
|
||||||
|
<tr id="api-key-item-${token}">
|
||||||
|
<td><b>${tokenName}</b></td>
|
||||||
|
<td id="api-key-${token}">${truncatedToken}</td>
|
||||||
|
<td>
|
||||||
|
<img onclick="copyAPIKey('${token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key" title="Copy API Key">
|
||||||
|
<img onclick="deleteAPIKey('${token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key" title="Delete API Key">
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
`;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
function listApiKeys() {
|
function listApiKeys() {
|
||||||
const apiKeyList = document.getElementById("api-key-list");
|
const apiKeyList = document.getElementById("api-key-list");
|
||||||
fetch('/auth/token')
|
fetch('/auth/token')
|
||||||
.then(response => response.json())
|
.then(response => response.json())
|
||||||
.then(tokens => {
|
.then(tokens => {
|
||||||
apiKeyList.innerHTML = tokens.map(tokenObj =>
|
apiKeyList.innerHTML = tokens.map(generateTokenRow).join("");
|
||||||
`
|
|
||||||
<tr id="api-key-item-${tokenObj.token}">
|
|
||||||
<td><b>${tokenObj.name}</b></td>
|
|
||||||
<td id="api-key-${tokenObj.token}">${tokenObj.token}</td>
|
|
||||||
<td>
|
|
||||||
<img id="api-key-copy-button-${tokenObj.token}" onclick="copyAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/copy-solid.svg" alt="Copy API Key">
|
|
||||||
<img id="api-key-delete-button-${tokenObj.token}" onclick="deleteAPIKey('${tokenObj.token}')" class="configured-icon enabled" src="/static/assets/icons/trash-solid.svg" alt="Delete API Key">
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
`)
|
|
||||||
.join("");
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@
|
|||||||
</head>
|
</head>
|
||||||
<script type="text/javascript" src="/static/assets/org.min.js"></script>
|
<script type="text/javascript" src="/static/assets/org.min.js"></script>
|
||||||
<script type="text/javascript" src="/static/assets/markdown-it.min.js"></script>
|
<script type="text/javascript" src="/static/assets/markdown-it.min.js"></script>
|
||||||
|
<script type="text/javascript" src="/static/assets/khoj.js"></script>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function render_image(item) {
|
function render_image(item) {
|
||||||
@@ -281,25 +282,10 @@
|
|||||||
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
<button id="khoj-banner-submit" class="khoj-banner-button">Submit</button>
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
<!--Add Header Logo and Nav Pane-->
|
<!--Add Header Logo and Nav Pane-->
|
||||||
<div class="khoj-header">
|
{% import 'utils.html' as utils %}
|
||||||
{% if demo %}
|
{{ utils.heading_pane(user_photo, username) }}
|
||||||
<a class="khoj-logo" href="https://khoj.dev" target="_blank">
|
|
||||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
|
||||||
</a>
|
|
||||||
{% else %}
|
|
||||||
<a class="khoj-logo" href="/">
|
|
||||||
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
|
||||||
</a>
|
|
||||||
{% endif %}
|
|
||||||
<nav class="khoj-nav">
|
|
||||||
<a class="khoj-nav" href="/chat">Chat</a>
|
|
||||||
<a class="khoj-nav khoj-nav-selected" href="/">Search</a>
|
|
||||||
{% if not demo %}
|
|
||||||
<a class="khoj-nav" href="/config">Settings</a>
|
|
||||||
{% endif %}
|
|
||||||
</nav>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
|
<!--Add Text Box To Enter Query, Trigger Incremental Search OnChange -->
|
||||||
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search your knowledge base using natural language">
|
<input type="text" id="query" class="option" onkeyup=incrementalSearch(event) autofocus="autofocus" placeholder="Search your knowledge base using natural language">
|
||||||
@@ -314,7 +300,7 @@
|
|||||||
</body>
|
</body>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
@media only screen and (max-width: 600px) {
|
@media only screen and (max-width: 700px) {
|
||||||
body {
|
body {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
@@ -325,7 +311,7 @@
|
|||||||
grid-column: 1;
|
grid-column: 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@media only screen and (min-width: 600px) {
|
@media only screen and (min-width: 700px) {
|
||||||
body {
|
body {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
||||||
@@ -339,8 +325,8 @@
|
|||||||
body {
|
body {
|
||||||
padding: 0px;
|
padding: 0px;
|
||||||
margin: 0px;
|
margin: 0px;
|
||||||
background: #fff;
|
background: var(--background-color);
|
||||||
color: #475569;
|
color: var(--main-text-color);
|
||||||
font-family: roboto, karma, segoe ui, sans-serif;
|
font-family: roboto, karma, segoe ui, sans-serif;
|
||||||
font-size: 20px;
|
font-size: 20px;
|
||||||
font-weight: 300;
|
font-weight: 300;
|
||||||
@@ -358,7 +344,7 @@
|
|||||||
#options > * {
|
#options > * {
|
||||||
padding: 15px;
|
padding: 15px;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
border: 1px solid #475569;
|
border: 1px solid var(--main-text-color);
|
||||||
background: #f9fafc
|
background: #f9fafc
|
||||||
}
|
}
|
||||||
.option:hover {
|
.option:hover {
|
||||||
@@ -386,7 +372,7 @@
|
|||||||
.image {
|
.image {
|
||||||
width: 20vw;
|
width: 20vw;
|
||||||
border-radius: 10px;
|
border-radius: 10px;
|
||||||
border: 1px solid #475569;
|
border: 1px solid var(--main-text-color);
|
||||||
}
|
}
|
||||||
#json {
|
#json {
|
||||||
white-space: pre-wrap;
|
white-space: pre-wrap;
|
||||||
@@ -429,7 +415,7 @@
|
|||||||
padding: 3.5px 3.5px 0;
|
padding: 3.5px 3.5px 0;
|
||||||
margin-right: 5px;
|
margin-right: 5px;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
border: 1px solid #475569;
|
border: 1px solid var(--main-text-color);
|
||||||
background-color: #ef4444;
|
background-color: #ef4444;
|
||||||
font-size: small;
|
font-size: small;
|
||||||
}
|
}
|
||||||
@@ -500,7 +486,7 @@
|
|||||||
input#khoj-banner-email {
|
input#khoj-banner-email {
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
border-radius: 5px;
|
border-radius: 5px;
|
||||||
border: 1px solid #475569;
|
border: 1px solid var(--main-text-color);
|
||||||
background: #f9fafc;
|
background: #f9fafc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -509,7 +495,7 @@
|
|||||||
box-shadow: 0 0 11px #aaa;
|
box-shadow: 0 0 11px #aaa;
|
||||||
}
|
}
|
||||||
|
|
||||||
@media only screen and (max-width: 600px) {
|
@media only screen and (max-width: 700px) {
|
||||||
a.khoj-banner {
|
a.khoj-banner {
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,7 +58,7 @@
|
|||||||
</body>
|
</body>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
@media only screen and (max-width: 600px) {
|
@media only screen and (max-width: 700px) {
|
||||||
body {
|
body {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
@@ -69,7 +69,7 @@
|
|||||||
grid-column: 1;
|
grid-column: 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@media only screen and (min-width: 600px) {
|
@media only screen and (min-width: 700px) {
|
||||||
body {
|
body {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
grid-template-columns: 1fr min(70vw, 100%) 1fr;
|
||||||
@@ -150,7 +150,7 @@
|
|||||||
font-size: x-large;
|
font-size: x-large;
|
||||||
}
|
}
|
||||||
|
|
||||||
@media only screen and (max-width: 600px) {
|
@media only screen and (max-width: 700px) {
|
||||||
a.khoj-banner {
|
a.khoj-banner {
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|||||||
24
src/khoj/interface/web/utils.html
Normal file
24
src/khoj/interface/web/utils.html
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{% macro heading_pane(user_photo, username) -%}
|
||||||
|
<div class="khoj-header">
|
||||||
|
<a class="khoj-logo" href="/" target="_blank">
|
||||||
|
<img class="khoj-logo" src="/static/assets/icons/khoj-logo-sideways-500.png" alt="Khoj"></img>
|
||||||
|
</a>
|
||||||
|
<nav class="khoj-nav">
|
||||||
|
<a class="khoj-nav" href="/chat">💬 Chat</a>
|
||||||
|
<a class="khoj-nav" href="/">🔎 Search</a>
|
||||||
|
<!-- Dropdown Menu -->
|
||||||
|
<div id="khoj-nav-menu-container" class="khoj-nav dropdown">
|
||||||
|
{% if user_photo and user_photo != "None" %}
|
||||||
|
<img class="circle" src="{{ user_photo }}" alt="{{ username[0].upper() }}" onclick="toggleMenu()" referrerpolicy="no-referrer">
|
||||||
|
{% else %}
|
||||||
|
<div class="circle user-initial" alt="{{ username[0].upper() }}" onclick="toggleMenu()">{{ username[0].upper() }}</div>
|
||||||
|
{% endif %}
|
||||||
|
<div id="khoj-nav-menu" class="khoj-nav-dropdown-content">
|
||||||
|
<div class="khoj-nav-username"> {{ username }} </div>
|
||||||
|
<a class="khoj-nav khoj-nav-selected" href="/config">⚙️ Settings</a>
|
||||||
|
<a class="khoj-nav" href="/auth/logout">🔑 Logout</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
{%- endmacro %}
|
||||||
@@ -10,17 +10,16 @@ import requests
|
|||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.rawconfig import Entry
|
from database.models import Entry as DbEntry, GithubConfig, KhojUser
|
||||||
from database.models import Embeddings, GithubConfig, KhojUser
|
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class GithubToJsonl(TextEmbeddings):
|
class GithubToEntries(TextToEntries):
|
||||||
def __init__(self, config: GithubConfig):
|
def __init__(self, config: GithubConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
raw_repos = config.githubrepoconfig.all()
|
raw_repos = config.githubrepoconfig.all()
|
||||||
@@ -78,24 +77,26 @@ class GithubToJsonl(TextEmbeddings):
|
|||||||
current_entries = []
|
current_entries = []
|
||||||
|
|
||||||
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
|
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
|
||||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
|
||||||
*GithubToJsonl.extract_markdown_entries(markdown_files)
|
*GithubToEntries.extract_markdown_entries(markdown_files)
|
||||||
)
|
)
|
||||||
|
|
||||||
with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
|
with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
|
||||||
current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
|
current_entries += OrgToEntries.convert_org_nodes_to_entries(
|
||||||
|
*GithubToEntries.extract_org_entries(org_files)
|
||||||
|
)
|
||||||
|
|
||||||
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
||||||
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
||||||
|
|
||||||
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
|
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
|
||||||
issue_entries = GithubToJsonl.convert_issues_to_entries(
|
issue_entries = GithubToEntries.convert_issues_to_entries(
|
||||||
*GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
|
*GithubToEntries.extract_github_issues(self.get_issues(repo_url))
|
||||||
)
|
)
|
||||||
current_entries += issue_entries
|
current_entries += issue_entries
|
||||||
|
|
||||||
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
||||||
current_entries = TextEmbeddings.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||||
|
|
||||||
return current_entries
|
return current_entries
|
||||||
|
|
||||||
@@ -103,7 +104,7 @@ class GithubToJsonl(TextEmbeddings):
|
|||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||||
current_entries, Embeddings.EmbeddingsType.GITHUB, key="compiled", logger=logger, user=user
|
current_entries, DbEntry.EntryType.GITHUB, key="compiled", logger=logger, user=user
|
||||||
)
|
)
|
||||||
|
|
||||||
return num_new_embeddings, num_deleted_embeddings
|
return num_new_embeddings, num_deleted_embeddings
|
||||||
@@ -281,7 +282,7 @@ class GithubToJsonl(TextEmbeddings):
|
|||||||
entries = []
|
entries = []
|
||||||
entry_to_file_map = []
|
entry_to_file_map = []
|
||||||
for doc in markdown_files:
|
for doc in markdown_files:
|
||||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
|
||||||
doc["content"], doc["path"], entries, entry_to_file_map
|
doc["content"], doc["path"], entries, entry_to_file_map
|
||||||
)
|
)
|
||||||
return entries, dict(entry_to_file_map)
|
return entries, dict(entry_to_file_map)
|
||||||
@@ -292,7 +293,7 @@ class GithubToJsonl(TextEmbeddings):
|
|||||||
entry_to_file_map = []
|
entry_to_file_map = []
|
||||||
|
|
||||||
for doc in org_files:
|
for doc in org_files:
|
||||||
entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
|
entries, entry_to_file_map = OrgToEntries.process_single_org_file(
|
||||||
doc["content"], doc["path"], entries, entry_to_file_map
|
doc["content"], doc["path"], entries, entry_to_file_map
|
||||||
)
|
)
|
||||||
return entries, dict(entry_to_file_map)
|
return entries, dict(entry_to_file_map)
|
||||||
@@ -6,17 +6,17 @@ from pathlib import Path
|
|||||||
from typing import Tuple, List
|
from typing import Tuple, List
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.constants import empty_escape_sequences
|
from khoj.utils.constants import empty_escape_sequences
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from database.models import Embeddings, KhojUser
|
from database.models import Entry as DbEntry, KhojUser
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MarkdownToJsonl(TextEmbeddings):
|
class MarkdownToEntries(TextToEntries):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEmbeddings):
|
|||||||
|
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
with timer("Parse entries from Markdown files into dictionaries", logger):
|
with timer("Parse entries from Markdown files into dictionaries", logger):
|
||||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
|
||||||
*MarkdownToJsonl.extract_markdown_entries(files)
|
*MarkdownToEntries.extract_markdown_entries(files)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Split entries by max tokens supported by model
|
# Split entries by max tokens supported by model
|
||||||
@@ -46,7 +46,7 @@ class MarkdownToJsonl(TextEmbeddings):
|
|||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||||
current_entries,
|
current_entries,
|
||||||
Embeddings.EmbeddingsType.MARKDOWN,
|
DbEntry.EntryType.MARKDOWN,
|
||||||
"compiled",
|
"compiled",
|
||||||
logger,
|
logger,
|
||||||
deletion_file_names,
|
deletion_file_names,
|
||||||
@@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEmbeddings):
|
|||||||
for markdown_file in markdown_files:
|
for markdown_file in markdown_files:
|
||||||
try:
|
try:
|
||||||
markdown_content = markdown_files[markdown_file]
|
markdown_content = markdown_files[markdown_file]
|
||||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
|
||||||
markdown_content, markdown_file, entries, entry_to_file_map
|
markdown_content, markdown_file, entries, entry_to_file_map
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -8,9 +8,9 @@ import requests
|
|||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from database.models import Embeddings, KhojUser, NotionConfig
|
from database.models import Entry as DbEntry, KhojUser, NotionConfig
|
||||||
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
|
|||||||
CALLOUT = "callout"
|
CALLOUT = "callout"
|
||||||
|
|
||||||
|
|
||||||
class NotionToJsonl(TextEmbeddings):
|
class NotionToEntries(TextToEntries):
|
||||||
def __init__(self, config: NotionConfig):
|
def __init__(self, config: NotionConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = NotionContentConfig(
|
self.config = NotionContentConfig(
|
||||||
@@ -250,7 +250,7 @@ class NotionToJsonl(TextEmbeddings):
|
|||||||
# Identify, mark and merge any new entries with previous entries
|
# Identify, mark and merge any new entries with previous entries
|
||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||||
current_entries, Embeddings.EmbeddingsType.NOTION, key="compiled", logger=logger, user=user
|
current_entries, DbEntry.EntryType.NOTION, key="compiled", logger=logger, user=user
|
||||||
)
|
)
|
||||||
|
|
||||||
return num_new_embeddings, num_deleted_embeddings
|
return num_new_embeddings, num_deleted_embeddings
|
||||||
@@ -5,17 +5,17 @@ from typing import Iterable, List, Tuple
|
|||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.org_mode import orgnode
|
from khoj.processor.org_mode import orgnode
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from khoj.utils import state
|
from khoj.utils import state
|
||||||
from database.models import Embeddings, KhojUser
|
from database.models import Entry as DbEntry, KhojUser
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class OrgToJsonl(TextEmbeddings):
|
class OrgToEntries(TextToEntries):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@@ -47,7 +47,7 @@ class OrgToJsonl(TextEmbeddings):
|
|||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||||
current_entries,
|
current_entries,
|
||||||
Embeddings.EmbeddingsType.ORG,
|
DbEntry.EntryType.ORG,
|
||||||
"compiled",
|
"compiled",
|
||||||
logger,
|
logger,
|
||||||
deletion_file_names,
|
deletion_file_names,
|
||||||
@@ -8,16 +8,16 @@ import base64
|
|||||||
from langchain.document_loaders import PyMuPDFLoader
|
from langchain.document_loaders import PyMuPDFLoader
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from database.models import Embeddings, KhojUser
|
from database.models import Entry as DbEntry, KhojUser
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PdfToJsonl(TextEmbeddings):
|
class PdfToEntries(TextToEntries):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ class PdfToJsonl(TextEmbeddings):
|
|||||||
|
|
||||||
# Extract Entries from specified Pdf files
|
# Extract Entries from specified Pdf files
|
||||||
with timer("Parse entries from PDF files into dictionaries", logger):
|
with timer("Parse entries from PDF files into dictionaries", logger):
|
||||||
current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
|
current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))
|
||||||
|
|
||||||
# Split entries by max tokens supported by model
|
# Split entries by max tokens supported by model
|
||||||
with timer("Split entries by max token size supported by model", logger):
|
with timer("Split entries by max token size supported by model", logger):
|
||||||
@@ -45,7 +45,7 @@ class PdfToJsonl(TextEmbeddings):
|
|||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||||
current_entries,
|
current_entries,
|
||||||
Embeddings.EmbeddingsType.PDF,
|
DbEntry.EntryType.PDF,
|
||||||
"compiled",
|
"compiled",
|
||||||
logger,
|
logger,
|
||||||
deletion_file_names,
|
deletion_file_names,
|
||||||
@@ -6,16 +6,16 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.helpers import timer
|
from khoj.utils.helpers import timer
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from database.models import Embeddings, KhojUser
|
from database.models import Entry as DbEntry, KhojUser
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PlaintextToJsonl(TextEmbeddings):
|
class PlaintextToEntries(TextToEntries):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||||||
try:
|
try:
|
||||||
plaintext_content = files[file]
|
plaintext_content = files[file]
|
||||||
if file.endswith(("html", "htm", "xml")):
|
if file.endswith(("html", "htm", "xml")):
|
||||||
plaintext_content = PlaintextToJsonl.extract_html_content(
|
plaintext_content = PlaintextToEntries.extract_html_content(
|
||||||
plaintext_content, file.split(".")[-1]
|
plaintext_content, file.split(".")[-1]
|
||||||
)
|
)
|
||||||
files[file] = plaintext_content
|
files[file] = plaintext_content
|
||||||
@@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||||||
|
|
||||||
# Extract Entries from specified plaintext files
|
# Extract Entries from specified plaintext files
|
||||||
with timer("Parse entries from plaintext files", logger):
|
with timer("Parse entries from plaintext files", logger):
|
||||||
current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
|
current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)
|
||||||
|
|
||||||
# Split entries by max tokens supported by model
|
# Split entries by max tokens supported by model
|
||||||
with timer("Split entries by max token size supported by model", logger):
|
with timer("Split entries by max token size supported by model", logger):
|
||||||
@@ -55,7 +55,7 @@ class PlaintextToJsonl(TextEmbeddings):
|
|||||||
with timer("Identify new or updated entries", logger):
|
with timer("Identify new or updated entries", logger):
|
||||||
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
num_new_embeddings, num_deleted_embeddings = self.update_embeddings(
|
||||||
current_entries,
|
current_entries,
|
||||||
Embeddings.EmbeddingsType.PLAINTEXT,
|
DbEntry.EntryType.PLAINTEXT,
|
||||||
key="compiled",
|
key="compiled",
|
||||||
logger=logger,
|
logger=logger,
|
||||||
deletion_filenames=deletion_file_names,
|
deletion_filenames=deletion_file_names,
|
||||||
@@ -12,14 +12,14 @@ from khoj.utils.helpers import timer, batcher
|
|||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from khoj.processor.embeddings import EmbeddingsModel
|
from khoj.processor.embeddings import EmbeddingsModel
|
||||||
from khoj.search_filter.date_filter import DateFilter
|
from khoj.search_filter.date_filter import DateFilter
|
||||||
from database.models import KhojUser, Embeddings, EmbeddingsDates
|
from database.models import KhojUser, Entry as DbEntry, EntryDates
|
||||||
from database.adapters import EmbeddingsAdapters
|
from database.adapters import EntryAdapters
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TextEmbeddings(ABC):
|
class TextToEntries(ABC):
|
||||||
def __init__(self, config: Any = None):
|
def __init__(self, config: Any = None):
|
||||||
self.embeddings_model = EmbeddingsModel()
|
self.embeddings_model = EmbeddingsModel()
|
||||||
self.config = config
|
self.config = config
|
||||||
@@ -85,23 +85,23 @@ class TextEmbeddings(ABC):
|
|||||||
):
|
):
|
||||||
with timer("Construct current entry hashes", logger):
|
with timer("Construct current entry hashes", logger):
|
||||||
hashes_by_file = dict[str, set[str]]()
|
hashes_by_file = dict[str, set[str]]()
|
||||||
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
|
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
|
||||||
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
||||||
for entry in tqdm(current_entries, desc="Hashing Entries"):
|
for entry in tqdm(current_entries, desc="Hashing Entries"):
|
||||||
hashes_by_file.setdefault(entry.file, set()).add(TextEmbeddings.hash_func(key)(entry))
|
hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
|
||||||
|
|
||||||
num_deleted_embeddings = 0
|
num_deleted_embeddings = 0
|
||||||
with timer("Preparing dataset for regeneration", logger):
|
with timer("Preparing dataset for regeneration", logger):
|
||||||
if regenerate:
|
if regenerate:
|
||||||
logger.debug(f"Deleting all embeddings for file type {file_type}")
|
logger.debug(f"Deleting all embeddings for file type {file_type}")
|
||||||
num_deleted_embeddings = EmbeddingsAdapters.delete_all_embeddings(user, file_type)
|
num_deleted_embeddings = EntryAdapters.delete_all_entries(user, file_type)
|
||||||
|
|
||||||
num_new_embeddings = 0
|
num_new_embeddings = 0
|
||||||
with timer("Identify hashes for adding new entries", logger):
|
with timer("Identify hashes for adding new entries", logger):
|
||||||
for file in tqdm(hashes_by_file, desc="Processing file with hashed values"):
|
for file in tqdm(hashes_by_file, desc="Processing file with hashed values"):
|
||||||
hashes_for_file = hashes_by_file[file]
|
hashes_for_file = hashes_by_file[file]
|
||||||
hashes_to_process = set()
|
hashes_to_process = set()
|
||||||
existing_entries = Embeddings.objects.filter(
|
existing_entries = DbEntry.objects.filter(
|
||||||
user=user, hashed_value__in=hashes_for_file, file_type=file_type
|
user=user, hashed_value__in=hashes_for_file, file_type=file_type
|
||||||
)
|
)
|
||||||
existing_entry_hashes = set([entry.hashed_value for entry in existing_entries])
|
existing_entry_hashes = set([entry.hashed_value for entry in existing_entries])
|
||||||
@@ -124,7 +124,7 @@ class TextEmbeddings(ABC):
|
|||||||
for entry_hash, embedding in entry_batch:
|
for entry_hash, embedding in entry_batch:
|
||||||
entry = hash_to_current_entries[entry_hash]
|
entry = hash_to_current_entries[entry_hash]
|
||||||
batch_embeddings_to_create.append(
|
batch_embeddings_to_create.append(
|
||||||
Embeddings(
|
DbEntry(
|
||||||
user=user,
|
user=user,
|
||||||
embeddings=embedding,
|
embeddings=embedding,
|
||||||
raw=entry.raw,
|
raw=entry.raw,
|
||||||
@@ -136,7 +136,7 @@ class TextEmbeddings(ABC):
|
|||||||
corpus_id=entry.corpus_id,
|
corpus_id=entry.corpus_id,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
new_embeddings = Embeddings.objects.bulk_create(batch_embeddings_to_create)
|
new_embeddings = DbEntry.objects.bulk_create(batch_embeddings_to_create)
|
||||||
logger.debug(f"Created {len(new_embeddings)} new embeddings")
|
logger.debug(f"Created {len(new_embeddings)} new embeddings")
|
||||||
num_new_embeddings += len(new_embeddings)
|
num_new_embeddings += len(new_embeddings)
|
||||||
|
|
||||||
@@ -146,26 +146,26 @@ class TextEmbeddings(ABC):
|
|||||||
dates = self.date_filter.extract_dates(embedding.raw)
|
dates = self.date_filter.extract_dates(embedding.raw)
|
||||||
for date in dates:
|
for date in dates:
|
||||||
dates_to_create.append(
|
dates_to_create.append(
|
||||||
EmbeddingsDates(
|
EntryDates(
|
||||||
date=date,
|
date=date,
|
||||||
embeddings=embedding,
|
embeddings=embedding,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
new_dates = EmbeddingsDates.objects.bulk_create(dates_to_create)
|
new_dates = EntryDates.objects.bulk_create(dates_to_create)
|
||||||
if len(new_dates) > 0:
|
if len(new_dates) > 0:
|
||||||
logger.debug(f"Created {len(new_dates)} new date entries")
|
logger.debug(f"Created {len(new_dates)} new date entries")
|
||||||
|
|
||||||
with timer("Identify hashes for removed entries", logger):
|
with timer("Identify hashes for removed entries", logger):
|
||||||
for file in hashes_by_file:
|
for file in hashes_by_file:
|
||||||
existing_entry_hashes = EmbeddingsAdapters.get_existing_entry_hashes_by_file(user, file)
|
existing_entry_hashes = EntryAdapters.get_existing_entry_hashes_by_file(user, file)
|
||||||
to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file]
|
to_delete_entry_hashes = set(existing_entry_hashes) - hashes_by_file[file]
|
||||||
num_deleted_embeddings += len(to_delete_entry_hashes)
|
num_deleted_embeddings += len(to_delete_entry_hashes)
|
||||||
EmbeddingsAdapters.delete_embedding_by_hash(user, hashed_values=list(to_delete_entry_hashes))
|
EntryAdapters.delete_entry_by_hash(user, hashed_values=list(to_delete_entry_hashes))
|
||||||
|
|
||||||
with timer("Identify hashes for deleting entries", logger):
|
with timer("Identify hashes for deleting entries", logger):
|
||||||
if deletion_filenames is not None:
|
if deletion_filenames is not None:
|
||||||
for file_path in deletion_filenames:
|
for file_path in deletion_filenames:
|
||||||
deleted_count = EmbeddingsAdapters.delete_embedding_by_file(user, file_path)
|
deleted_count = EntryAdapters.delete_entry_by_file(user, file_path)
|
||||||
num_deleted_embeddings += deleted_count
|
num_deleted_embeddings += deleted_count
|
||||||
|
|
||||||
return num_new_embeddings, num_deleted_embeddings
|
return num_new_embeddings, num_deleted_embeddings
|
||||||
@@ -180,11 +180,11 @@ class TextEmbeddings(ABC):
|
|||||||
):
|
):
|
||||||
# Hash all current and previous entries to identify new entries
|
# Hash all current and previous entries to identify new entries
|
||||||
with timer("Hash previous, current entries", logger):
|
with timer("Hash previous, current entries", logger):
|
||||||
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
|
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
|
||||||
previous_entry_hashes = list(map(TextEmbeddings.hash_func(key), previous_entries))
|
previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
|
||||||
if deletion_filenames is not None:
|
if deletion_filenames is not None:
|
||||||
deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
|
deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
|
||||||
deletion_entry_hashes = list(map(TextEmbeddings.hash_func(key), deletion_entries))
|
deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
|
||||||
else:
|
else:
|
||||||
deletion_entry_hashes = []
|
deletion_entry_hashes = []
|
||||||
|
|
||||||
@@ -48,7 +48,7 @@ from khoj.processor.conversation.gpt4all.chat_model import extract_questions_off
|
|||||||
from fastapi.requests import Request
|
from fastapi.requests import Request
|
||||||
|
|
||||||
from database import adapters
|
from database import adapters
|
||||||
from database.adapters import EmbeddingsAdapters, ConversationAdapters
|
from database.adapters import EntryAdapters, ConversationAdapters
|
||||||
from database.models import LocalMarkdownConfig, LocalOrgConfig, LocalPdfConfig, LocalPlaintextConfig, KhojUser
|
from database.models import LocalMarkdownConfig, LocalOrgConfig, LocalPdfConfig, LocalPlaintextConfig, KhojUser
|
||||||
|
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ if not state.demo:
|
|||||||
@requires(["authenticated"])
|
@requires(["authenticated"])
|
||||||
def get_config_data(request: Request):
|
def get_config_data(request: Request):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
EmbeddingsAdapters.get_unique_file_types(user)
|
EntryAdapters.get_unique_file_types(user)
|
||||||
|
|
||||||
return state.config
|
return state.config
|
||||||
|
|
||||||
@@ -145,7 +145,7 @@ if not state.demo:
|
|||||||
|
|
||||||
configuration_update_metadata = {}
|
configuration_update_metadata = {}
|
||||||
|
|
||||||
enabled_content = await sync_to_async(EmbeddingsAdapters.get_unique_file_types)(user)
|
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
|
||||||
|
|
||||||
if state.config.content_type is not None:
|
if state.config.content_type is not None:
|
||||||
configuration_update_metadata["github"] = "github" in enabled_content
|
configuration_update_metadata["github"] = "github" in enabled_content
|
||||||
@@ -241,9 +241,9 @@ if not state.demo:
|
|||||||
raise ValueError(f"Invalid content type: {content_type}")
|
raise ValueError(f"Invalid content type: {content_type}")
|
||||||
|
|
||||||
await content_object.objects.filter(user=user).adelete()
|
await content_object.objects.filter(user=user).adelete()
|
||||||
await sync_to_async(EmbeddingsAdapters.delete_all_embeddings)(user, content_type)
|
await sync_to_async(EntryAdapters.delete_all_entries)(user, content_type)
|
||||||
|
|
||||||
enabled_content = await sync_to_async(EmbeddingsAdapters.get_unique_file_types)(user)
|
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
||||||
@api.post("/delete/config/data/processor/conversation/openai", status_code=200)
|
@api.post("/delete/config/data/processor/conversation/openai", status_code=200)
|
||||||
@@ -372,7 +372,7 @@ def get_config_types(
|
|||||||
):
|
):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
|
|
||||||
enabled_file_types = EmbeddingsAdapters.get_unique_file_types(user)
|
enabled_file_types = EntryAdapters.get_unique_file_types(user)
|
||||||
|
|
||||||
configured_content_types = list(enabled_file_types)
|
configured_content_types = list(enabled_file_types)
|
||||||
|
|
||||||
@@ -706,7 +706,7 @@ async def extract_references_and_questions(
|
|||||||
if conversation_type == ConversationCommand.General:
|
if conversation_type == ConversationCommand.General:
|
||||||
return compiled_references, inferred_queries, q
|
return compiled_references, inferred_queries, q
|
||||||
|
|
||||||
if not await EmbeddingsAdapters.user_has_embeddings(user=user):
|
if not await EntryAdapters.user_has_entries(user=user):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
|
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -10,12 +10,12 @@ from starlette.authentication import requires
|
|||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils import state, constants
|
from khoj.utils import state, constants
|
||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
|
||||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
from khoj.processor.github.github_to_entries import GithubToEntries
|
||||||
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
|
from khoj.processor.notion.notion_to_entries import NotionToEntries
|
||||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||||
from khoj.search_type import text_search, image_search
|
from khoj.search_type import text_search, image_search
|
||||||
from khoj.routers.helpers import update_telemetry_state
|
from khoj.routers.helpers import update_telemetry_state
|
||||||
from khoj.utils.yaml import save_config_to_file_updated_state
|
from khoj.utils.yaml import save_config_to_file_updated_state
|
||||||
@@ -201,7 +201,7 @@ def configure_content(
|
|||||||
logger.info("🦄 Setting up search for orgmode notes")
|
logger.info("🦄 Setting up search for orgmode notes")
|
||||||
# Extract Entries, Generate Notes Embeddings
|
# Extract Entries, Generate Notes Embeddings
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
files.get("org"),
|
files.get("org"),
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
full_corpus=full_corpus,
|
full_corpus=full_corpus,
|
||||||
@@ -216,7 +216,7 @@ def configure_content(
|
|||||||
logger.info("💎 Setting up search for markdown notes")
|
logger.info("💎 Setting up search for markdown notes")
|
||||||
# Extract Entries, Generate Markdown Embeddings
|
# Extract Entries, Generate Markdown Embeddings
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
MarkdownToJsonl,
|
MarkdownToEntries,
|
||||||
files.get("markdown"),
|
files.get("markdown"),
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
full_corpus=full_corpus,
|
full_corpus=full_corpus,
|
||||||
@@ -232,7 +232,7 @@ def configure_content(
|
|||||||
logger.info("🖨️ Setting up search for pdf")
|
logger.info("🖨️ Setting up search for pdf")
|
||||||
# Extract Entries, Generate PDF Embeddings
|
# Extract Entries, Generate PDF Embeddings
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
PdfToJsonl,
|
PdfToEntries,
|
||||||
files.get("pdf"),
|
files.get("pdf"),
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
full_corpus=full_corpus,
|
full_corpus=full_corpus,
|
||||||
@@ -248,7 +248,7 @@ def configure_content(
|
|||||||
logger.info("📄 Setting up search for plaintext")
|
logger.info("📄 Setting up search for plaintext")
|
||||||
# Extract Entries, Generate Plaintext Embeddings
|
# Extract Entries, Generate Plaintext Embeddings
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
PlaintextToJsonl,
|
PlaintextToEntries,
|
||||||
files.get("plaintext"),
|
files.get("plaintext"),
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
full_corpus=full_corpus,
|
full_corpus=full_corpus,
|
||||||
@@ -281,7 +281,7 @@ def configure_content(
|
|||||||
logger.info("🐙 Setting up search for github")
|
logger.info("🐙 Setting up search for github")
|
||||||
# Extract Entries, Generate Github Embeddings
|
# Extract Entries, Generate Github Embeddings
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
GithubToJsonl,
|
GithubToEntries,
|
||||||
None,
|
None,
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
full_corpus=full_corpus,
|
full_corpus=full_corpus,
|
||||||
@@ -298,7 +298,7 @@ def configure_content(
|
|||||||
if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
|
if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
|
||||||
logger.info("🔌 Setting up search for notion")
|
logger.info("🔌 Setting up search for notion")
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
NotionToJsonl,
|
NotionToEntries,
|
||||||
None,
|
None,
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
full_corpus=full_corpus,
|
full_corpus=full_corpus,
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from khoj.utils.rawconfig import (
|
|||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils import constants, state
|
from khoj.utils import constants, state
|
||||||
from database.adapters import EmbeddingsAdapters, get_user_github_config, get_user_notion_config, ConversationAdapters
|
from database.adapters import EntryAdapters, get_user_github_config, get_user_notion_config, ConversationAdapters
|
||||||
from database.models import LocalOrgConfig, LocalMarkdownConfig, LocalPdfConfig, LocalPlaintextConfig
|
from database.models import LocalOrgConfig, LocalMarkdownConfig, LocalPdfConfig, LocalPlaintextConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -34,19 +34,52 @@ VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf", "plaintext"]
|
|||||||
@web_client.get("/", response_class=FileResponse)
|
@web_client.get("/", response_class=FileResponse)
|
||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def index(request: Request):
|
def index(request: Request):
|
||||||
return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo})
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
|
|
||||||
|
return templates.TemplateResponse(
|
||||||
|
"index.html",
|
||||||
|
context={
|
||||||
|
"request": request,
|
||||||
|
"demo": state.demo,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@web_client.post("/", response_class=FileResponse)
|
@web_client.post("/", response_class=FileResponse)
|
||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def index_post(request: Request):
|
def index_post(request: Request):
|
||||||
return templates.TemplateResponse("index.html", context={"request": request, "demo": state.demo})
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
|
|
||||||
|
return templates.TemplateResponse(
|
||||||
|
"index.html",
|
||||||
|
context={
|
||||||
|
"request": request,
|
||||||
|
"demo": state.demo,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@web_client.get("/chat", response_class=FileResponse)
|
@web_client.get("/chat", response_class=FileResponse)
|
||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def chat_page(request: Request):
|
def chat_page(request: Request):
|
||||||
return templates.TemplateResponse("chat.html", context={"request": request, "demo": state.demo})
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
|
|
||||||
|
return templates.TemplateResponse(
|
||||||
|
"chat.html",
|
||||||
|
context={
|
||||||
|
"request": request,
|
||||||
|
"demo": state.demo,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@web_client.get("/login", response_class=FileResponse)
|
@web_client.get("/login", response_class=FileResponse)
|
||||||
@@ -84,7 +117,8 @@ if not state.demo:
|
|||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def config_page(request: Request):
|
def config_page(request: Request):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
enabled_content = set(EmbeddingsAdapters.get_unique_file_types(user).all())
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
|
enabled_content = set(EntryAdapters.get_unique_file_types(user).all())
|
||||||
default_full_config = FullConfig(
|
default_full_config = FullConfig(
|
||||||
content_type=None,
|
content_type=None,
|
||||||
search_type=None,
|
search_type=None,
|
||||||
@@ -128,7 +162,8 @@ if not state.demo:
|
|||||||
"current_config": current_config,
|
"current_config": current_config,
|
||||||
"current_model_state": successfully_configured,
|
"current_model_state": successfully_configured,
|
||||||
"anonymous_mode": state.anonymous_mode,
|
"anonymous_mode": state.anonymous_mode,
|
||||||
"username": user.username if user else None,
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -136,6 +171,7 @@ if not state.demo:
|
|||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def github_config_page(request: Request):
|
def github_config_page(request: Request):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
current_github_config = get_user_github_config(user)
|
current_github_config = get_user_github_config(user)
|
||||||
|
|
||||||
if current_github_config:
|
if current_github_config:
|
||||||
@@ -158,13 +194,20 @@ if not state.demo:
|
|||||||
current_config = {} # type: ignore
|
current_config = {} # type: ignore
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
"content_type_github_input.html", context={"request": request, "current_config": current_config}
|
"content_type_github_input.html",
|
||||||
|
context={
|
||||||
|
"request": request,
|
||||||
|
"current_config": current_config,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
@web_client.get("/config/content_type/notion", response_class=HTMLResponse)
|
@web_client.get("/config/content_type/notion", response_class=HTMLResponse)
|
||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def notion_config_page(request: Request):
|
def notion_config_page(request: Request):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
current_notion_config = get_user_notion_config(user)
|
current_notion_config = get_user_notion_config(user)
|
||||||
|
|
||||||
current_config = NotionContentConfig(
|
current_config = NotionContentConfig(
|
||||||
@@ -174,7 +217,13 @@ if not state.demo:
|
|||||||
current_config = json.loads(current_config.json())
|
current_config = json.loads(current_config.json())
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
"content_type_notion_input.html", context={"request": request, "current_config": current_config}
|
"content_type_notion_input.html",
|
||||||
|
context={
|
||||||
|
"request": request,
|
||||||
|
"current_config": current_config,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
|
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
|
||||||
@@ -185,6 +234,7 @@ if not state.demo:
|
|||||||
|
|
||||||
object = map_config_to_object(content_type)
|
object = map_config_to_object(content_type)
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
config = object.objects.filter(user=user).first()
|
config = object.objects.filter(user=user).first()
|
||||||
if config == None:
|
if config == None:
|
||||||
config = object.objects.create(user=user)
|
config = object.objects.create(user=user)
|
||||||
@@ -202,6 +252,8 @@ if not state.demo:
|
|||||||
"request": request,
|
"request": request,
|
||||||
"current_config": current_config,
|
"current_config": current_config,
|
||||||
"content_type": content_type,
|
"content_type": content_type,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -209,6 +261,7 @@ if not state.demo:
|
|||||||
@requires(["authenticated"], redirect="login_page")
|
@requires(["authenticated"], redirect="login_page")
|
||||||
def conversation_processor_config_page(request: Request):
|
def conversation_processor_config_page(request: Request):
|
||||||
user = request.user.object
|
user = request.user.object
|
||||||
|
user_picture = request.session.get("user", {}).get("picture")
|
||||||
openai_config = ConversationAdapters.get_openai_conversation_config(user)
|
openai_config = ConversationAdapters.get_openai_conversation_config(user)
|
||||||
|
|
||||||
if openai_config:
|
if openai_config:
|
||||||
@@ -229,5 +282,7 @@ if not state.demo:
|
|||||||
context={
|
context={
|
||||||
"request": request,
|
"request": request,
|
||||||
"current_config": current_processor_openai_config,
|
"current_config": current_processor_openai_config,
|
||||||
|
"username": user.username,
|
||||||
|
"user_photo": user_picture,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -6,31 +6,31 @@ from typing import List, Tuple, Type, Union, Dict
|
|||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import torch
|
import torch
|
||||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
from sentence_transformers import util
|
||||||
|
|
||||||
from asgiref.sync import sync_to_async
|
from asgiref.sync import sync_to_async
|
||||||
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils import state
|
from khoj.utils import state
|
||||||
from khoj.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, timer
|
from khoj.utils.helpers import get_absolute_path, timer
|
||||||
from khoj.utils.models import BaseEncoder
|
from khoj.utils.models import BaseEncoder
|
||||||
from khoj.utils.state import SearchType
|
from khoj.utils.state import SearchType
|
||||||
from khoj.utils.rawconfig import SearchResponse, Entry
|
from khoj.utils.rawconfig import SearchResponse, Entry
|
||||||
from khoj.utils.jsonl import load_jsonl
|
from khoj.utils.jsonl import load_jsonl
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from database.adapters import EmbeddingsAdapters
|
from database.adapters import EntryAdapters
|
||||||
from database.models import KhojUser, Embeddings
|
from database.models import KhojUser, Entry as DbEntry
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
search_type_to_embeddings_type = {
|
search_type_to_embeddings_type = {
|
||||||
SearchType.Org.value: Embeddings.EmbeddingsType.ORG,
|
SearchType.Org.value: DbEntry.EntryType.ORG,
|
||||||
SearchType.Markdown.value: Embeddings.EmbeddingsType.MARKDOWN,
|
SearchType.Markdown.value: DbEntry.EntryType.MARKDOWN,
|
||||||
SearchType.Plaintext.value: Embeddings.EmbeddingsType.PLAINTEXT,
|
SearchType.Plaintext.value: DbEntry.EntryType.PLAINTEXT,
|
||||||
SearchType.Pdf.value: Embeddings.EmbeddingsType.PDF,
|
SearchType.Pdf.value: DbEntry.EntryType.PDF,
|
||||||
SearchType.Github.value: Embeddings.EmbeddingsType.GITHUB,
|
SearchType.Github.value: DbEntry.EntryType.GITHUB,
|
||||||
SearchType.Notion.value: Embeddings.EmbeddingsType.NOTION,
|
SearchType.Notion.value: DbEntry.EntryType.NOTION,
|
||||||
SearchType.All.value: None,
|
SearchType.All.value: None,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -121,7 +121,7 @@ async def query(
|
|||||||
# Find relevant entries for the query
|
# Find relevant entries for the query
|
||||||
top_k = 10
|
top_k = 10
|
||||||
with timer("Search Time", logger, state.device):
|
with timer("Search Time", logger, state.device):
|
||||||
hits = EmbeddingsAdapters.search_with_embeddings(
|
hits = EntryAdapters.search_with_embeddings(
|
||||||
user=user,
|
user=user,
|
||||||
embeddings=question_embedding,
|
embeddings=question_embedding,
|
||||||
max_results=top_k,
|
max_results=top_k,
|
||||||
@@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):
|
|||||||
|
|
||||||
|
|
||||||
def setup(
|
def setup(
|
||||||
text_to_jsonl: Type[TextEmbeddings],
|
text_to_entries: Type[TextToEntries],
|
||||||
files: dict[str, str],
|
files: dict[str, str],
|
||||||
regenerate: bool,
|
regenerate: bool,
|
||||||
full_corpus: bool = True,
|
full_corpus: bool = True,
|
||||||
@@ -196,11 +196,11 @@ def setup(
|
|||||||
config=None,
|
config=None,
|
||||||
) -> None:
|
) -> None:
|
||||||
if config:
|
if config:
|
||||||
num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process(
|
num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
|
||||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process(
|
num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
|
||||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ app = FastAPI()
|
|||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.configure import configure_routes, configure_search_types, configure_middleware
|
from khoj.configure import configure_routes, configure_search_types, configure_middleware
|
||||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||||
from khoj.search_type import image_search, text_search
|
from khoj.search_type import image_search, text_search
|
||||||
from khoj.utils.config import SearchModels
|
from khoj.utils.config import SearchModels
|
||||||
from khoj.utils.constants import web_directory
|
from khoj.utils.constants import web_directory
|
||||||
@@ -26,7 +26,7 @@ from khoj.utils.rawconfig import (
|
|||||||
)
|
)
|
||||||
from khoj.utils import state, fs_syncer
|
from khoj.utils import state, fs_syncer
|
||||||
from khoj.routers.indexer import configure_content
|
from khoj.routers.indexer import configure_content
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||||
from database.models import (
|
from database.models import (
|
||||||
KhojApiUser,
|
KhojApiUser,
|
||||||
LocalOrgConfig,
|
LocalOrgConfig,
|
||||||
@@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
|
|||||||
user=default_user,
|
user=default_user,
|
||||||
)
|
)
|
||||||
|
|
||||||
text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
|
||||||
|
|
||||||
if os.getenv("GITHUB_PAT_TOKEN"):
|
if os.getenv("GITHUB_PAT_TOKEN"):
|
||||||
GithubConfig.objects.create(
|
GithubConfig.objects.create(
|
||||||
@@ -242,7 +242,7 @@ def client(
|
|||||||
# These lines help us Mock the Search models for these search types
|
# These lines help us Mock the Search models for these search types
|
||||||
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
state.search_models.image_search = image_search.initialize_model(search_config.image)
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
get_sample_data("org"),
|
get_sample_data("org"),
|
||||||
regenerate=False,
|
regenerate=False,
|
||||||
user=api_user.user,
|
user=api_user.user,
|
||||||
@@ -251,7 +251,7 @@ def client(
|
|||||||
content_config.image, state.search_models.image_search, regenerate=False
|
content_config.image, state.search_models.image_search, regenerate=False
|
||||||
)
|
)
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
PlaintextToJsonl,
|
PlaintextToEntries,
|
||||||
get_sample_data("plaintext"),
|
get_sample_data("plaintext"),
|
||||||
regenerate=False,
|
regenerate=False,
|
||||||
user=api_user.user,
|
user=api_user.user,
|
||||||
|
|||||||
@@ -15,9 +15,9 @@ from khoj.utils import state
|
|||||||
from khoj.utils.state import search_models, content_index, config
|
from khoj.utils.state import search_models, content_index, config
|
||||||
from khoj.search_type import text_search, image_search
|
from khoj.search_type import text_search, image_search
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||||
from database.models import KhojUser
|
from database.models import KhojUser
|
||||||
from database.adapters import EmbeddingsAdapters
|
from database.adapters import EntryAdapters
|
||||||
|
|
||||||
|
|
||||||
# Test
|
# Test
|
||||||
@@ -176,9 +176,9 @@ def test_regenerate_with_github_fails_without_pat(client):
|
|||||||
@pytest.mark.skip(reason="Flaky test on parallel test runs")
|
@pytest.mark.skip(reason="Flaky test on parallel test runs")
|
||||||
def test_get_configured_types_via_api(client, sample_org_data):
|
def test_get_configured_types_via_api(client, sample_org_data):
|
||||||
# Act
|
# Act
|
||||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False)
|
||||||
|
|
||||||
enabled_types = EmbeddingsAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)
|
enabled_types = EntryAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert list(enabled_types) == ["org"]
|
assert list(enabled_types) == ["org"]
|
||||||
@@ -189,7 +189,7 @@ def test_get_configured_types_via_api(client, sample_org_data):
|
|||||||
def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
|
def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
response = client.get(f"/api/config/types", headers=headers)
|
response = client.get(f"/api/config/types", headers=headers)
|
||||||
@@ -255,7 +255,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
|
|||||||
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
|
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
user_query = quote("How to git install application?")
|
user_query = quote("How to git install application?")
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
@@ -276,7 +276,7 @@ def test_notes_search_with_only_filters(
|
|||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
sample_org_data,
|
sample_org_data,
|
||||||
regenerate=False,
|
regenerate=False,
|
||||||
user=default_user,
|
user=default_user,
|
||||||
@@ -298,7 +298,7 @@ def test_notes_search_with_only_filters(
|
|||||||
def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser):
|
def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
user_query = quote('How to git install application? +"Emacs"')
|
user_query = quote('How to git install application? +"Emacs"')
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
@@ -317,7 +317,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
|
|||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-secret"}
|
headers = {"Authorization": "Bearer kk-secret"}
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
sample_org_data,
|
sample_org_data,
|
||||||
regenerate=False,
|
regenerate=False,
|
||||||
user=default_user,
|
user=default_user,
|
||||||
@@ -339,7 +339,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
|
|||||||
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
|
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
|
||||||
# Arrange
|
# Arrange
|
||||||
headers = {"Authorization": "Bearer kk-token"} # Token for default_user2
|
headers = {"Authorization": "Bearer kk-token"} # Token for default_user2
|
||||||
text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
|
||||||
user_query = quote("How to git install application?")
|
user_query = quote("How to git install application?")
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||||
from khoj.utils.fs_syncer import get_markdown_files
|
from khoj.utils.fs_syncer import get_markdown_files
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
from khoj.utils.rawconfig import TextContentConfig
|
||||||
|
|
||||||
@@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
|
||||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
|
MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
@@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
|
||||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
|
MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
@@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
|
entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
|
entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(entries) == 2
|
assert len(entries) == 2
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ import json
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
from khoj.processor.text_to_entries import TextToEntries
|
||||||
from khoj.utils.helpers import is_none_or_empty
|
from khoj.utils.helpers import is_none_or_empty
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
from khoj.utils.fs_syncer import get_org_files
|
from khoj.utils.fs_syncer import get_org_files
|
||||||
@@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
|
|||||||
for index_heading_entries in [True, False]:
|
for index_heading_entries in [True, False]:
|
||||||
# Act
|
# Act
|
||||||
# Extract entries into jsonl from specified Org files
|
# Extract entries into jsonl from specified Org files
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
||||||
OrgToJsonl.convert_org_nodes_to_entries(
|
OrgToEntries.convert_org_nodes_to_entries(
|
||||||
*OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
|
*OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
@@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
|
entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Split each entry from specified Org files by max words
|
# Split each entry from specified Org files by max words
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
||||||
TextEmbeddings.split_entries_by_max_tokens(
|
TextToEntries.split_entries_by_max_tokens(
|
||||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
@@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Split entry by max words and drop words larger than max word length
|
# Split entry by max words and drop words larger than max word length
|
||||||
processed_entry = TextEmbeddings.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
# "Heading" dropped from compiled version because its over the set max word limit
|
# "Heading" dropped from compiled version because its over the set max word limit
|
||||||
@@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
|
entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
|
||||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
|
OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
@@ -136,11 +136,11 @@ Intro text
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
|
entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
|
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
|
entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
|
||||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
|
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Org files
|
# Extract Entries from specified Org files
|
||||||
entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
|
entries, _ = OrgToEntries.extract_org_entries(org_files=data)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(entries) == 2
|
assert len(entries) == 2
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
|
||||||
|
|
||||||
from khoj.utils.fs_syncer import get_pdf_files
|
from khoj.utils.fs_syncer import get_pdf_files
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
from khoj.utils.rawconfig import TextContentConfig
|
||||||
@@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl():
|
|||||||
pdf_bytes = f.read()
|
pdf_bytes = f.read()
|
||||||
|
|
||||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Pdf Files
|
# Process Each Entry from All Pdf Files
|
||||||
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
|
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
|
||||||
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
@@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl():
|
|||||||
pdf_bytes = f.read()
|
pdf_bytes = f.read()
|
||||||
|
|
||||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
|
||||||
|
|
||||||
# Process Each Entry from All Pdf Files
|
# Process Each Entry from All Pdf Files
|
||||||
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
|
jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
|
||||||
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
|
||||||
)
|
)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from pathlib import Path
|
|||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils.fs_syncer import get_plaintext_files
|
from khoj.utils.fs_syncer import get_plaintext_files
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
from khoj.utils.rawconfig import TextContentConfig
|
||||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||||
from database.models import LocalPlaintextConfig, KhojUser
|
from database.models import LocalPlaintextConfig, KhojUser
|
||||||
|
|
||||||
|
|
||||||
@@ -27,14 +27,14 @@ def test_plaintext_file(tmp_path):
|
|||||||
f"{plaintextfile}": entry,
|
f"{plaintextfile}": entry,
|
||||||
}
|
}
|
||||||
|
|
||||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
|
maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)
|
||||||
|
|
||||||
# Convert each entry.file to absolute path to make them JSON serializable
|
# Convert each entry.file to absolute path to make them JSON serializable
|
||||||
for map in maps:
|
for map in maps:
|
||||||
map.file = str(Path(map.file).absolute())
|
map.file = str(Path(map.file).absolute())
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
|
jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -100,7 +100,7 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
|
|||||||
extracted_plaintext_files = get_plaintext_files(config=config)
|
extracted_plaintext_files = get_plaintext_files(config=config)
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
|
maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(maps) == 1
|
assert len(maps) == 1
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
# System Packages
|
# System Packages
|
||||||
import logging
|
import logging
|
||||||
import locale
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
@@ -11,10 +10,10 @@ import pytest
|
|||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.search_type import text_search
|
from khoj.search_type import text_search
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
from khoj.utils.rawconfig import ContentConfig, SearchConfig
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
from khoj.processor.github.github_to_entries import GithubToEntries
|
||||||
from khoj.utils.fs_syncer import collect_files, get_org_files
|
from khoj.utils.fs_syncer import collect_files, get_org_files
|
||||||
from database.models import LocalOrgConfig, KhojUser, Embeddings, GithubConfig
|
from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -66,7 +65,7 @@ def test_text_search_setup_with_empty_file_raises_error(
|
|||||||
# Act
|
# Act
|
||||||
# Generate notes embeddings during asymmetric setup
|
# Generate notes embeddings during asymmetric setup
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
|
|
||||||
assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
|
assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
|
||||||
verify_embeddings(0, default_user)
|
verify_embeddings(0, default_user)
|
||||||
@@ -81,7 +80,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
|
|||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||||
data = get_org_files(org_config)
|
data = get_org_files(org_config)
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert "Deleting all embeddings for file type org" in caplog.text
|
assert "Deleting all embeddings for file type org" in caplog.text
|
||||||
@@ -95,7 +94,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
|
|||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||||
data = get_org_files(org_config)
|
data = get_org_files(org_config)
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert "Created 4 new embeddings" in caplog.text
|
assert "Created 4 new embeddings" in caplog.text
|
||||||
@@ -113,13 +112,13 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
|
|||||||
# Act
|
# Act
|
||||||
# Generate initial notes embeddings during asymmetric setup
|
# Generate initial notes embeddings during asymmetric setup
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
initial_logs = caplog.text
|
initial_logs = caplog.text
|
||||||
caplog.clear() # Clear logs
|
caplog.clear() # Clear logs
|
||||||
|
|
||||||
# Run asymmetric setup again with no changes to data source. Ensure index is not updated
|
# Run asymmetric setup again with no changes to data source. Ensure index is not updated
|
||||||
with caplog.at_level(logging.DEBUG):
|
with caplog.at_level(logging.DEBUG):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||||
final_logs = caplog.text
|
final_logs = caplog.text
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -149,7 +148,7 @@ async def test_text_search(search_config: SearchConfig):
|
|||||||
await loop.run_in_executor(
|
await loop.run_in_executor(
|
||||||
None,
|
None,
|
||||||
text_search.setup,
|
text_search.setup,
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
data,
|
data,
|
||||||
True,
|
True,
|
||||||
True,
|
True,
|
||||||
@@ -186,7 +185,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
|
|||||||
# Act
|
# Act
|
||||||
# reload embeddings, entries, notes model after adding new org-mode file
|
# reload embeddings, entries, notes model after adding new org-mode file
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
# verify newly added org-mode entry is split by max tokens
|
# verify newly added org-mode entry is split by max tokens
|
||||||
@@ -219,7 +218,7 @@ conda activate khoj
|
|||||||
#+end_src"""
|
#+end_src"""
|
||||||
}
|
}
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
data,
|
data,
|
||||||
regenerate=False,
|
regenerate=False,
|
||||||
user=default_user,
|
user=default_user,
|
||||||
@@ -238,7 +237,7 @@ conda activate khoj
|
|||||||
# reload embeddings, entries, notes model after adding new org-mode file
|
# reload embeddings, entries, notes model after adding new org-mode file
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
OrgToJsonl,
|
OrgToEntries,
|
||||||
data,
|
data,
|
||||||
regenerate=False,
|
regenerate=False,
|
||||||
full_corpus=False,
|
full_corpus=False,
|
||||||
@@ -260,7 +259,7 @@ def test_regenerate_index_with_new_entry(
|
|||||||
data = get_org_files(org_config)
|
data = get_org_files(org_config)
|
||||||
|
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
|
|
||||||
assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
|
assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
|
||||||
|
|
||||||
@@ -274,7 +273,7 @@ def test_regenerate_index_with_new_entry(
|
|||||||
# Act
|
# Act
|
||||||
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
# regenerate notes jsonl, model embeddings and model to include entry from new file
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message
|
assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message
|
||||||
@@ -299,7 +298,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
|
|||||||
# Act
|
# Act
|
||||||
# generate embeddings, entries, notes model from scratch after adding new org-mode file
|
# generate embeddings, entries, notes model from scratch after adding new org-mode file
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
initial_logs = caplog.text
|
initial_logs = caplog.text
|
||||||
caplog.clear() # Clear logs
|
caplog.clear() # Clear logs
|
||||||
|
|
||||||
@@ -307,7 +306,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
|
|||||||
|
|
||||||
# update embeddings, entries, notes model with no new changes
|
# update embeddings, entries, notes model with no new changes
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||||
final_logs = caplog.text
|
final_logs = caplog.text
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -332,7 +331,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
|
|||||||
|
|
||||||
# load embeddings, entries, notes model after adding new org file with 2 entries
|
# load embeddings, entries, notes model after adding new org file with 2 entries
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
initial_logs = caplog.text
|
initial_logs = caplog.text
|
||||||
caplog.clear() # Clear logs
|
caplog.clear() # Clear logs
|
||||||
|
|
||||||
@@ -344,7 +343,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
|
|||||||
|
|
||||||
# Act
|
# Act
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||||
final_logs = caplog.text
|
final_logs = caplog.text
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -362,7 +361,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
|
|||||||
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
|
||||||
data = get_org_files(org_config)
|
data = get_org_files(org_config)
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
|
||||||
initial_logs = caplog.text
|
initial_logs = caplog.text
|
||||||
caplog.clear() # Clear logs
|
caplog.clear() # Clear logs
|
||||||
|
|
||||||
@@ -376,7 +375,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
|
|||||||
# Act
|
# Act
|
||||||
# update embeddings, entries with the newly added note
|
# update embeddings, entries with the newly added note
|
||||||
with caplog.at_level(logging.INFO):
|
with caplog.at_level(logging.INFO):
|
||||||
text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
|
text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
|
||||||
final_logs = caplog.text
|
final_logs = caplog.text
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
@@ -394,7 +393,7 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
|
|||||||
# Act
|
# Act
|
||||||
# Regenerate github embeddings to test asymmetric setup without caching
|
# Regenerate github embeddings to test asymmetric setup without caching
|
||||||
text_search.setup(
|
text_search.setup(
|
||||||
GithubToJsonl,
|
GithubToEntries,
|
||||||
{},
|
{},
|
||||||
regenerate=True,
|
regenerate=True,
|
||||||
user=default_user,
|
user=default_user,
|
||||||
@@ -402,10 +401,10 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
embeddings = Embeddings.objects.filter(user=default_user, file_type="github").count()
|
embeddings = Entry.objects.filter(user=default_user, file_type="github").count()
|
||||||
assert embeddings > 1
|
assert embeddings > 1
|
||||||
|
|
||||||
|
|
||||||
def verify_embeddings(expected_count, user):
|
def verify_embeddings(expected_count, user):
|
||||||
embeddings = Embeddings.objects.filter(user=user, file_type="org").count()
|
embeddings = Entry.objects.filter(user=user, file_type="org").count()
|
||||||
assert embeddings == expected_count
|
assert embeddings == expected_count
|
||||||
|
|||||||
Reference in New Issue
Block a user