mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-04 13:20:17 +00:00
Rename TextEmbeddings to TextEntries for improved readability
Improves readability, as the new name more closely matches the underlying constructs
This commit is contained in:
@@ -12,14 +12,14 @@ from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from database.models import Entry as DbEntry, GithubConfig, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GithubToJsonl(TextEmbeddings):
|
||||
class GithubToJsonl(TextEntries):
|
||||
def __init__(self, config: GithubConfig):
|
||||
super().__init__(config)
|
||||
raw_repos = config.githubrepoconfig.all()
|
||||
@@ -94,7 +94,7 @@ class GithubToJsonl(TextEmbeddings):
|
||||
current_entries += issue_entries
|
||||
|
||||
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
||||
current_entries = TextEmbeddings.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
|
||||
return current_entries
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from pathlib import Path
|
||||
from typing import Tuple, List
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
from khoj.utils.rawconfig import Entry
|
||||
@@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownToJsonl(TextEmbeddings):
|
||||
class MarkdownToJsonl(TextEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import requests
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Entry as DbEntry, KhojUser, NotionConfig
|
||||
|
||||
@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
|
||||
CALLOUT = "callout"
|
||||
|
||||
|
||||
class NotionToJsonl(TextEmbeddings):
|
||||
class NotionToJsonl(TextEntries):
|
||||
def __init__(self, config: NotionConfig):
|
||||
super().__init__(config)
|
||||
self.config = NotionContentConfig(
|
||||
|
||||
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.org_mode import orgnode
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.utils import state
|
||||
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OrgToJsonl(TextEmbeddings):
|
||||
class OrgToJsonl(TextEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ import base64
|
||||
from langchain.document_loaders import PyMuPDFLoader
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
@@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PdfToJsonl(TextEmbeddings):
|
||||
class PdfToJsonl(TextEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PlaintextToJsonl(TextEmbeddings):
|
||||
class PlaintextToJsonl(TextEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ from database.adapters import EntryAdapters
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextEmbeddings(ABC):
|
||||
class TextEntries(ABC):
|
||||
def __init__(self, config: Any = None):
|
||||
self.embeddings_model = EmbeddingsModel()
|
||||
self.config = config
|
||||
@@ -85,10 +85,10 @@ class TextEmbeddings(ABC):
|
||||
):
|
||||
with timer("Construct current entry hashes", logger):
|
||||
hashes_by_file = dict[str, set[str]]()
|
||||
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
|
||||
current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
|
||||
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
||||
for entry in tqdm(current_entries, desc="Hashing Entries"):
|
||||
hashes_by_file.setdefault(entry.file, set()).add(TextEmbeddings.hash_func(key)(entry))
|
||||
hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry))
|
||||
|
||||
num_deleted_embeddings = 0
|
||||
with timer("Preparing dataset for regeneration", logger):
|
||||
@@ -180,11 +180,11 @@ class TextEmbeddings(ABC):
|
||||
):
|
||||
# Hash all current and previous entries to identify new entries
|
||||
with timer("Hash previous, current entries", logger):
|
||||
current_entry_hashes = list(map(TextEmbeddings.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextEmbeddings.hash_func(key), previous_entries))
|
||||
current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries))
|
||||
if deletion_filenames is not None:
|
||||
deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
|
||||
deletion_entry_hashes = list(map(TextEmbeddings.hash_func(key), deletion_entries))
|
||||
deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries))
|
||||
else:
|
||||
deletion_entry_hashes = []
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder
|
||||
from khoj.utils.state import SearchType
|
||||
from khoj.utils.rawconfig import SearchResponse, Entry
|
||||
from khoj.utils.jsonl import load_jsonl
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from database.adapters import EntryAdapters
|
||||
from database.models import KhojUser, Entry as DbEntry
|
||||
|
||||
@@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):
|
||||
|
||||
|
||||
def setup(
|
||||
text_to_jsonl: Type[TextEmbeddings],
|
||||
text_to_jsonl: Type[TextEntries],
|
||||
files: dict[str, str],
|
||||
regenerate: bool,
|
||||
full_corpus: bool = True,
|
||||
|
||||
@@ -4,7 +4,7 @@ import os
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextEmbeddings
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.utils.helpers import is_none_or_empty
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.utils.fs_syncer import get_org_files
|
||||
@@ -63,7 +63,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
||||
|
||||
# Split each entry from specified Org files by max words
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||
TextEmbeddings.split_entries_by_max_tokens(
|
||||
TextEntries.split_entries_by_max_tokens(
|
||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
||||
)
|
||||
)
|
||||
@@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():
|
||||
|
||||
# Act
|
||||
# Split entry by max words and drop words larger than max word length
|
||||
processed_entry = TextEmbeddings.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
||||
processed_entry = TextEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
|
||||
|
||||
# Assert
|
||||
# "Heading" dropped from compiled version because it's over the set max word limit
|
||||
|
||||
Reference in New Issue
Block a user