Rename TextEmbeddings to TextEntries for improved readability

Improves readability as name has closer match to underlying constructs
2026-03-04 13:20:17 +00:00 · 2023-10-31 18:55:59 -07:00
parent bcbee05a9e
commit 87e6b1eab9
9 changed files with 24 additions and 24 deletions
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -12,14 +12,14 @@ from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from database.models import Entry as DbEntry, GithubConfig, KhojUser


 logger = logging.getLogger(__name__)


-class GithubToJsonl(TextEmbeddings):
+class GithubToJsonl(TextEntries):
    def __init__(self, config: GithubConfig):
        super().__init__(config)
        raw_repos = config.githubrepoconfig.all()
@@ -94,7 +94,7 @@ class GithubToJsonl(TextEmbeddings):
            current_entries += issue_entries

        with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
-            current_entries = TextEmbeddings.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)

        return current_entries

--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import Tuple, List

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from khoj.utils.helpers import timer
 from khoj.utils.constants import empty_escape_sequences
 from khoj.utils.rawconfig import Entry
@@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class MarkdownToJsonl(TextEmbeddings):
+class MarkdownToJsonl(TextEntries):
    def __init__(self):
        super().__init__()

--- a/src/khoj/processor/notion/notion_to_jsonl.py
+++ b/src/khoj/processor/notion/notion_to_jsonl.py
@@ -8,7 +8,7 @@ import requests
 # Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, NotionContentConfig
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser, NotionConfig

@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
    CALLOUT = "callout"


-class NotionToJsonl(TextEmbeddings):
+class NotionToJsonl(TextEntries):
    def __init__(self, config: NotionConfig):
        super().__init__(config)
        self.config = NotionContentConfig(
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple

 # Internal Packages
 from khoj.processor.org_mode import orgnode
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from khoj.utils import state
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class OrgToJsonl(TextEmbeddings):
+class OrgToJsonl(TextEntries):
    def __init__(self):
        super().__init__()

--- a/src/khoj/processor/pdf/pdf_to_jsonl.py
+++ b/src/khoj/processor/pdf/pdf_to_jsonl.py
@@ -8,7 +8,7 @@ import base64
 from langchain.document_loaders import PyMuPDFLoader

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser
@@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class PdfToJsonl(TextEmbeddings):
+class PdfToJsonl(TextEntries):
    def __init__(self):
        super().__init__()

--- a/src/khoj/processor/plaintext/plaintext_to_jsonl.py
+++ b/src/khoj/processor/plaintext/plaintext_to_jsonl.py
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup


 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class PlaintextToJsonl(TextEmbeddings):
+class PlaintextToJsonl(TextEntries):
    def __init__(self):
        super().__init__()

--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -19,7 +19,7 @@ from database.adapters import EntryAdapters
 logger = logging.getLogger(__name__)


-class TextEmbeddings(ABC):
+class TextEntries(ABC):
    def __init__(self, config: Any = None):
        self.embeddings_model = EmbeddingsModel()
        self.config = config
@@ -85,10 +85,10 @@ class TextEmbeddings(ABC):
    ):
        with timer("Construct current entry hashes", logger):
            hashes_by_file = dict[str, set[str]]()
-            current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
+            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
            hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
            for entry in tqdm(current_entries, desc="Hashing Entries"):
-                hashes_by_file.setdefault(entry.file, set()).add(TextEmbeddings.hash_func(key)(entry))
+                hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry))

        num_deleted_embeddings = 0
        with timer("Preparing dataset for regeneration", logger):
@@ -180,11 +180,11 @@ class TextEmbeddings(ABC):
    ):
        # Hash all current and previous entries to identify new entries
        with timer("Hash previous, current entries", logger):
-            current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
-            previous_entry_hashes = list(map(TextEmbeddings.hash_func(key), previous_entries))
+            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
+            previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries))
            if deletion_filenames is not None:
                deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
-                deletion_entry_hashes = list(map(TextEmbeddings.hash_func(key), deletion_entries))
+                deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries))
            else:
                deletion_entry_hashes = []

--- a/src/khoj/search_type/text_search.py
+++ b/src/khoj/search_type/text_search.py
@@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder
 from khoj.utils.state import SearchType
 from khoj.utils.rawconfig import SearchResponse, Entry
 from khoj.utils.jsonl import load_jsonl
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from database.adapters import EntryAdapters
 from database.models import KhojUser, Entry as DbEntry

@@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):


 def setup(
-    text_to_jsonl: Type[TextEmbeddings],
+    text_to_jsonl: Type[TextEntries],
    files: dict[str, str],
    regenerate: bool,
    full_corpus: bool = True,
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -4,7 +4,7 @@ import os

 # Internal Packages
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEmbeddings
+from khoj.processor.text_to_jsonl import TextEntries
 from khoj.utils.helpers import is_none_or_empty
 from khoj.utils.rawconfig import Entry
 from khoj.utils.fs_syncer import get_org_files
@@ -63,7 +63,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):

    # Split each entry from specified Org files by max words
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        TextEmbeddings.split_entries_by_max_tokens(
+        TextEntries.split_entries_by_max_tokens(
            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
        )
    )
@@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():

    # Act
    # Split entry by max words and drop words larger than max word length
-    processed_entry = TextEmbeddings.split_entries_by_max_tokens([entry], max_word_length=5)[0]
+    processed_entry = TextEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]

    # Assert
    # "Heading" dropped from compiled version because its over the set max word limit