diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index a548ae1b..98e771dc 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -12,14 +12,14 @@ from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from database.models import Entry as DbEntry, GithubConfig, KhojUser logger = logging.getLogger(__name__) -class GithubToJsonl(TextEmbeddings): +class GithubToJsonl(TextEntries): def __init__(self, config: GithubConfig): super().__init__(config) raw_repos = config.githubrepoconfig.all() @@ -94,7 +94,7 @@ class GithubToJsonl(TextEmbeddings): current_entries += issue_entries with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger): - current_entries = TextEmbeddings.split_entries_by_max_tokens(current_entries, max_tokens=256) + current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256) return current_entries diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 921f2213..86acc4b3 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Tuple, List # Internal Packages -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from khoj.utils.helpers import timer from khoj.utils.constants import empty_escape_sequences from khoj.utils.rawconfig import Entry @@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class MarkdownToJsonl(TextEmbeddings): +class MarkdownToJsonl(TextEntries): def __init__(self): super().__init__() diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_jsonl.py index 15c21b23..048642ef 100644 --- a/src/khoj/processor/notion/notion_to_jsonl.py +++ b/src/khoj/processor/notion/notion_to_jsonl.py @@ -8,7 +8,7 @@ import requests # Internal Packages from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry, NotionContentConfig -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from khoj.utils.rawconfig import Entry from database.models import Entry as DbEntry, KhojUser, NotionConfig @@ -50,7 +50,7 @@ class NotionBlockType(Enum): CALLOUT = "callout" -class NotionToJsonl(TextEmbeddings): +class NotionToJsonl(TextEntries): def __init__(self, config: NotionConfig): super().__init__(config) self.config = NotionContentConfig( diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 9bf85660..fbb43f55 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple # Internal Packages from khoj.processor.org_mode import orgnode -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry from khoj.utils import state @@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class OrgToJsonl(TextEmbeddings): +class OrgToJsonl(TextEntries): def __init__(self): super().__init__() diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index feed12d7..034e51f4 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -8,7 +8,7 @@ import base64 from langchain.document_loaders import PyMuPDFLoader # Internal Packages -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry from database.models import Entry as DbEntry, KhojUser @@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class PdfToJsonl(TextEmbeddings): +class PdfToJsonl(TextEntries): def __init__(self): super().__init__() diff --git a/src/khoj/processor/plaintext/plaintext_to_jsonl.py b/src/khoj/processor/plaintext/plaintext_to_jsonl.py index a657ff2f..1094baa2 100644 --- a/src/khoj/processor/plaintext/plaintext_to_jsonl.py +++ b/src/khoj/processor/plaintext/plaintext_to_jsonl.py @@ -6,7 +6,7 @@ from bs4 import BeautifulSoup # Internal Packages -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from khoj.utils.helpers import timer from khoj.utils.rawconfig import Entry from database.models import Entry as DbEntry, KhojUser @@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser logger = logging.getLogger(__name__) -class PlaintextToJsonl(TextEmbeddings): +class PlaintextToJsonl(TextEntries): def __init__(self): super().__init__() diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index 3aa6a5b1..763db9df 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -19,7 +19,7 @@ from database.adapters import EntryAdapters logger = logging.getLogger(__name__) -class TextEmbeddings(ABC): +class TextEntries(ABC): def __init__(self, config: Any = None): self.embeddings_model = EmbeddingsModel() self.config = config @@ -85,10 +85,10 @@ class TextEmbeddings(ABC): ): with timer("Construct current entry hashes", logger): hashes_by_file = dict[str, set[str]]() - current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries)) + current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries)) hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) for entry in tqdm(current_entries, desc="Hashing Entries"): - hashes_by_file.setdefault(entry.file, set()).add(TextEmbeddings.hash_func(key)(entry)) + hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry)) num_deleted_embeddings = 0 with timer("Preparing dataset for regeneration", logger): @@ -180,11 +180,11 @@ class TextEmbeddings(ABC): ): # Hash all current and previous entries to identify new entries with timer("Hash previous, current entries", logger): - current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries)) - previous_entry_hashes = list(map(TextEmbeddings.hash_func(key), previous_entries)) + current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries)) + previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries)) if deletion_filenames is not None: deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames] - deletion_entry_hashes = list(map(TextEmbeddings.hash_func(key), deletion_entries)) + deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries)) else: deletion_entry_hashes = [] diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index db3b313c..e1da9043 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder from khoj.utils.state import SearchType from khoj.utils.rawconfig import SearchResponse, Entry from khoj.utils.jsonl import load_jsonl -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from database.adapters import EntryAdapters from database.models import KhojUser, Entry as DbEntry @@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query): def setup( - text_to_jsonl: Type[TextEmbeddings], + text_to_jsonl: Type[TextEntries], files: dict[str, str], regenerate: bool, full_corpus: bool = True, diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index d47c212e..c9ccf0d6 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -4,7 +4,7 @@ import os # Internal Packages from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl -from khoj.processor.text_to_jsonl import TextEmbeddings +from khoj.processor.text_to_jsonl import TextEntries from khoj.utils.helpers import is_none_or_empty from khoj.utils.rawconfig import Entry from khoj.utils.fs_syncer import get_org_files @@ -63,7 +63,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Split each entry from specified Org files by max words jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( - TextEmbeddings.split_entries_by_max_tokens( + TextEntries.split_entries_by_max_tokens( OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 ) ) @@ -86,7 +86,7 @@ def test_entry_split_drops_large_words(): # Act # Split entry by max words and drop words larger than max word length - processed_entry = TextEmbeddings.split_entries_by_max_tokens([entry], max_word_length=5)[0] + processed_entry = TextEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0] # Assert # "Heading" dropped from compiled version because its over the set max word limit