mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-02 13:18:18 +00:00
# Motivation A major component of useful AI systems is adaptation to the user context. This is a major reason why we enabled syncing knowledge bases. The next step in this direction is to dynamically update the evolving state of the user as conversations take place across time and topics. This allows for more personalized conversations and maintains context across conversations. # Overview This change introduces medium and long term memories in Khoj. - The scope of a conversation can be thought of as short term memory. - Medium term memory extends to the past week. - Long term memory extends to anytime in the past, where a search query results in a match. # Details - Enable user to view and manage agent generated memories from their settings page - Fully integrate the memory object into all downstream usage, from image generation, notes extraction, online search, etc. - Scope memory per agent. The default agent has access to memories created by other agents as well. - Enable users and admins to enable/disable Khoj's memory system --------- Co-authored-by: Debanjum <debanjum@gmail.com>
327 lines
9.5 KiB
Python
327 lines
9.5 KiB
Python
import glob
|
|
import logging
|
|
import os
|
|
from datetime import datetime
|
|
|
|
import factory
|
|
from asgiref.sync import sync_to_async
|
|
from django.utils.timezone import make_aware
|
|
|
|
from khoj.database.adapters import AgentAdapters
|
|
from khoj.database.models import (
|
|
Agent,
|
|
AiModelApi,
|
|
ChatMessageModel,
|
|
ChatModel,
|
|
Conversation,
|
|
KhojApiUser,
|
|
KhojUser,
|
|
ProcessLock,
|
|
SearchModelConfig,
|
|
ServerChatSettings,
|
|
Subscription,
|
|
UserConversationConfig,
|
|
UserMemory,
|
|
)
|
|
from khoj.processor.conversation.utils import message_to_log
|
|
from khoj.utils.helpers import get_absolute_path, is_none_or_empty
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_chat_provider(default: ChatModel.ModelType | None = ChatModel.ModelType.GOOGLE):
    """Pick the chat model provider to run tests against.

    An explicit KHOJ_TEST_CHAT_PROVIDER env var wins when it names a known
    provider; otherwise the provider is inferred from whichever API key is
    configured, falling back to `default` when none is set.
    """
    forced = os.getenv("KHOJ_TEST_CHAT_PROVIDER")
    if forced and forced in ChatModel.ModelType:
        return ChatModel.ModelType(forced)

    # Infer provider from configured API keys, checked in priority order.
    key_to_provider = [
        ("OPENAI_API_KEY", ChatModel.ModelType.OPENAI),
        ("GEMINI_API_KEY", ChatModel.ModelType.GOOGLE),
        ("ANTHROPIC_API_KEY", ChatModel.ModelType.ANTHROPIC),
    ]
    for env_var, provider in key_to_provider:
        if os.getenv(env_var):
            return provider
    return default
|
|
|
|
|
|
def get_chat_api_key(provider: ChatModel.ModelType | None = None):
    """Return the API key for the given chat provider from the environment.

    Args:
        provider: Chat provider to fetch the key for. When None, the provider
            detected by get_chat_provider() is used.

    Returns:
        The provider's API key, or — for an unrecognized provider — the first
        configured key among OpenAI, Gemini and Anthropic; None if none is set.
    """
    # Fix: the annotation previously read `ChatModel.ModelType = None`, i.e. a
    # non-optional annotation with a None default; `| None` makes it honest.
    provider = provider or get_chat_provider()
    if provider == ChatModel.ModelType.OPENAI:
        return os.getenv("OPENAI_API_KEY")
    elif provider == ChatModel.ModelType.GOOGLE:
        return os.getenv("GEMINI_API_KEY")
    elif provider == ChatModel.ModelType.ANTHROPIC:
        return os.getenv("ANTHROPIC_API_KEY")
    else:
        return os.getenv("OPENAI_API_KEY") or os.getenv("GEMINI_API_KEY") or os.getenv("ANTHROPIC_API_KEY")
|
|
|
|
|
|
def generate_chat_history(message_list):
    """Build a chat history from (user_message, chat_response, context) triples.

    Each triple is appended to the history via message_to_log, tagging the
    message with a "memory" intent whose query mirrors the user message.
    """
    chat_history: list[ChatMessageModel] = []
    for user_message, chat_response, context in message_list:
        metadata = {
            "context": context,
            "intent": {"type": "memory", "query": user_message, "inferred-queries": [user_message]},
        }
        message_to_log(user_message, chat_response, metadata, chat_history=chat_history)
    return chat_history
|
|
|
|
|
|
def get_sample_data(type: str) -> dict[str, str]:
    """Return sample file contents for the given content type, keyed by filename.

    Args:
        type: One of "org", "markdown" or "plaintext".
            NOTE(review): shadows the `type` builtin; kept for caller compatibility.

    Returns:
        Mapping of sample filename to its raw file content.

    Raises:
        KeyError: If `type` is not a supported content type.
    """
    # Canned notes in each supported file format, used to index test content.
    sample_data = {
        "org": {
            "elisp.org": """
* Emacs Khoj
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/

** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]

** Installation
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
(use-package khoj
:load-path "~/.emacs.d/lisp/khoj.el"
:bind ("C-c s" . 'khoj))
#+end_src

*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Khoj Package
(use-package khoj
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
:bind ("C-c s" . 'khoj))
#+end_src

** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g. "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g. "What is the meaning of life? -god +none"

""",
            "readme.org": """
* Khoj
/Allow natural language search on user content like notes, images using transformer based models/

All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline

** Dependencies
- Python3
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]

** Install
#+begin_src shell
git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml
conda activate khoj
#+end_src""",
        },
        "markdown": {
            "readme.markdown": """
# Khoj
Allow natural language search on user content like notes, images using transformer based models

All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline

## Dependencies
- Python3
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)

## Install
```shell
git clone
conda env create -f environment.yml
conda activate khoj
```
"""
        },
        "plaintext": {
            "readme.txt": """
Khoj
Allow natural language search on user content like notes, images using transformer based models

All data is processed locally. User can interface with khoj app via Emacs, API or Commandline

Dependencies
- Python3
- Miniconda

Install
git clone
conda env create -f environment.yml
conda activate khoj
"""
        },
    }

    return sample_data[type]
|
|
|
|
|
|
def get_index_files(
    input_files: list[str] | None = None, input_filters: list[str] | None = ["tests/data/org/*.org"]
) -> dict[str, str]:
    """Resolve explicit files and glob filters, then read their contents.

    Args:
        input_files: Explicit file paths to read.
        input_filters: Glob patterns (recursive) selecting additional files.
            NOTE: the list default is never mutated, so sharing it across
            calls is safe here despite the mutable-default lint smell.

    Returns:
        Mapping of absolute file path to file content. Unreadable files are
        logged and skipped. Empty dict when neither input is specified.
    """
    # Input Validation
    if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
        logger.debug("At least one of input_files or input_filter is required to be specified")
        return {}

    # Get files to process
    absolute_files, filtered_files = set(), set()
    if input_files:
        absolute_files = {get_absolute_path(input_file) for input_file in input_files}
    if input_filters:
        filtered_files = {
            filtered_file
            for file_filter in input_filters
            for filtered_file in glob.glob(get_absolute_path(file_filter), recursive=True)
            if os.path.isfile(filtered_file)
        }

    # Sort for deterministic ordering across runs.
    all_files = sorted(absolute_files | filtered_files)

    filename_to_content_map = {}
    for file in all_files:
        # Fix: open() now sits inside the try so an open failure (permissions,
        # file removed between glob and read) is skipped like a read failure,
        # instead of crashing the whole indexing pass.
        try:
            with open(file, "r", encoding="utf8") as f:
                filename_to_content_map[file] = f.read()
        except Exception as e:
            logger.warning(f"Unable to read file: {file}. Skipping file.")
            logger.warning(e, exc_info=True)

    return filename_to_content_map
|
|
|
|
|
|
class UserFactory(factory.django.DjangoModelFactory):
    """Build KhojUser rows with randomized identity fields for tests."""

    class Meta:
        model = KhojUser

    # Faker providers generate a fresh value for each created instance.
    username = factory.Faker("name")
    email = factory.Faker("email")
    password = factory.Faker("password")
    uuid = factory.Faker("uuid4")
|
|
|
|
|
|
class ApiUserFactory(factory.django.DjangoModelFactory):
    """Build KhojApiUser rows (API token identities) for tests."""

    class Meta:
        model = KhojApiUser

    # No default owner: callers must pass user=... when creating an instance.
    user = None
    name = factory.Faker("name")
    token = factory.Faker("password")
|
|
|
|
|
|
class AiModelApiFactory(factory.django.DjangoModelFactory):
    """Build an AiModelApi holding the API key for the detected chat provider."""

    class Meta:
        model = AiModelApi

    # Fix: the previous bare call `get_chat_api_key()` ran once at class
    # definition (module import) time, freezing the env lookup. LazyFunction
    # defers it to instance-creation time so env vars set by test setup after
    # import are respected.
    api_key = factory.LazyFunction(get_chat_api_key)
|
|
|
|
|
|
class ChatModelFactory(factory.django.DjangoModelFactory):
    """Build a ChatModel configured for the provider detected from the environment."""

    class Meta:
        model = ChatModel

    max_prompt_size = 20000
    tokenizer = None
    name = "gemini-2.5-flash"
    # Fix: resolve the provider lazily, per instance, instead of once at class
    # definition (module import) time — env vars set by test setup after
    # import are then respected. Mirrors the lazy pattern used below.
    model_type = factory.LazyFunction(get_chat_provider)
    # Only attach an AiModelApi when an API key is actually configured.
    ai_model_api = factory.LazyAttribute(lambda obj: AiModelApiFactory() if get_chat_api_key() else None)
|
|
|
|
|
|
class UserConversationProcessorConfigFactory(factory.django.DjangoModelFactory):
    """Link a test user to a chat model configuration."""

    class Meta:
        model = UserConversationConfig

    user = factory.SubFactory(UserFactory)
    setting = factory.SubFactory(ChatModelFactory)
|
|
|
|
|
|
class ConversationFactory(factory.django.DjangoModelFactory):
    """Build an empty Conversation owned by a fresh test user."""

    class Meta:
        model = Conversation

    user = factory.SubFactory(UserFactory)
|
|
|
|
|
|
class SearchModelFactory(factory.django.DjangoModelFactory):
    """Build the default text search model configuration used in tests."""

    class Meta:
        model = SearchModelConfig

    name = "default"
    model_type = "text"
    # Small embedding + reranker models to keep test runs light.
    bi_encoder = "thenlper/gte-small"
    cross_encoder = "mixedbread-ai/mxbai-rerank-xsmall-v1"
|
|
|
|
|
|
class SubscriptionFactory(factory.django.DjangoModelFactory):
    """Give a test user an active, non-recurring standard subscription."""

    class Meta:
        model = Subscription

    user = factory.SubFactory(UserFactory)
    type = Subscription.Type.STANDARD
    is_recurring = False
    # Far-future renewal date so the subscription never lapses during tests.
    renewal_date = make_aware(datetime.strptime("2100-04-01", "%Y-%m-%d"))
|
|
|
|
|
|
class ProcessLockFactory(factory.django.DjangoModelFactory):
    """Build a named ProcessLock row for lock-contention tests."""

    class Meta:
        model = ProcessLock

    name = "test_lock"
|
|
|
|
|
|
class ServerChatSettingsFactory(factory.django.DjangoModelFactory):
    """Build server-wide chat settings with the memory system enabled by default."""

    class Meta:
        model = ServerChatSettings

    memory_mode = ServerChatSettings.MemoryMode.ENABLED_DEFAULT_ON
|
|
|
|
|
|
# Async-safe wrappers for factories and ORM operations
|
|
async def acreate_user():
    """Create a KhojUser off the event loop via the sync factory."""
    make_user = sync_to_async(UserFactory)
    return await make_user()
|
|
|
|
|
|
async def acreate_subscription(user):
    """Create a Subscription for `user` off the event loop via the sync factory."""
    make_subscription = sync_to_async(SubscriptionFactory)
    return await make_subscription(user=user)
|
|
|
|
|
|
async def acreate_chat_model():
    """Create a ChatModel off the event loop via the sync factory."""
    make_chat_model = sync_to_async(ChatModelFactory)
    return await make_chat_model()
|
|
|
|
|
|
async def acreate_default_agent():
    """Create the default Khoj agent without blocking the event loop."""
    make_default_agent = sync_to_async(AgentAdapters.create_default_agent)
    return await make_default_agent()
|
|
|
|
|
|
async def acreate_agent(name, chat_model, personality):
    """Create an Agent row without blocking the event loop."""
    create_agent = sync_to_async(Agent.objects.create)
    return await create_agent(name=name, chat_model=chat_model, personality=personality)
|
|
|
|
|
|
async def acreate_test_memory(user, agent=None, raw_text="test memory"):
    """Create a memory directly in DB without embeddings for testing."""
    # 384-dim placeholder vector; real embeddings are not needed for these tests.
    dummy_embeddings = [0.1] * 384
    create_memory = sync_to_async(UserMemory.objects.create)
    return await create_memory(user=user, agent=agent, raw=raw_text, embeddings=dummy_embeddings)
|