mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-06 05:39:12 +00:00
Rename files and classes from X_To_JSONL to the more appropriate X_To_Entries
These content processors convert content into entries in the DB rather than entries in a JSONL file.
This commit is contained in:
@@ -10,16 +10,16 @@ import requests
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from database.models import Entry as DbEntry, GithubConfig, KhojUser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GithubToJsonl(TextEntries):
|
||||
class GithubToEntries(TextToEntries):
|
||||
def __init__(self, config: GithubConfig):
|
||||
super().__init__(config)
|
||||
raw_repos = config.githubrepoconfig.all()
|
||||
@@ -77,24 +77,26 @@ class GithubToJsonl(TextEntries):
|
||||
current_entries = []
|
||||
|
||||
with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
|
||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
||||
*GithubToJsonl.extract_markdown_entries(markdown_files)
|
||||
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
|
||||
*GithubToEntries.extract_markdown_entries(markdown_files)
|
||||
)
|
||||
|
||||
with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
|
||||
current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
|
||||
current_entries += OrgToEntries.convert_org_nodes_to_entries(
|
||||
*GithubToEntries.extract_org_entries(org_files)
|
||||
)
|
||||
|
||||
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
||||
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
||||
|
||||
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
|
||||
issue_entries = GithubToJsonl.convert_issues_to_entries(
|
||||
*GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
|
||||
issue_entries = GithubToEntries.convert_issues_to_entries(
|
||||
*GithubToEntries.extract_github_issues(self.get_issues(repo_url))
|
||||
)
|
||||
current_entries += issue_entries
|
||||
|
||||
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
||||
current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
|
||||
return current_entries
|
||||
|
||||
@@ -280,7 +282,7 @@ class GithubToJsonl(TextEntries):
|
||||
entries = []
|
||||
entry_to_file_map = []
|
||||
for doc in markdown_files:
|
||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
||||
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
|
||||
doc["content"], doc["path"], entries, entry_to_file_map
|
||||
)
|
||||
return entries, dict(entry_to_file_map)
|
||||
@@ -291,7 +293,7 @@ class GithubToJsonl(TextEntries):
|
||||
entry_to_file_map = []
|
||||
|
||||
for doc in org_files:
|
||||
entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
|
||||
entries, entry_to_file_map = OrgToEntries.process_single_org_file(
|
||||
doc["content"], doc["path"], entries, entry_to_file_map
|
||||
)
|
||||
return entries, dict(entry_to_file_map)
|
||||
@@ -6,7 +6,7 @@ from pathlib import Path
|
||||
from typing import Tuple, List
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.constants import empty_escape_sequences
|
||||
from khoj.utils.rawconfig import Entry
|
||||
@@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownToJsonl(TextEntries):
|
||||
class MarkdownToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEntries):
|
||||
|
||||
# Extract Entries from specified Markdown files
|
||||
with timer("Parse entries from Markdown files into dictionaries", logger):
|
||||
current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
|
||||
*MarkdownToJsonl.extract_markdown_entries(files)
|
||||
current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
|
||||
*MarkdownToEntries.extract_markdown_entries(files)
|
||||
)
|
||||
|
||||
# Split entries by max tokens supported by model
|
||||
@@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEntries):
|
||||
for markdown_file in markdown_files:
|
||||
try:
|
||||
markdown_content = markdown_files[markdown_file]
|
||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
||||
entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
|
||||
markdown_content, markdown_file, entries, entry_to_file_map
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -8,7 +8,7 @@ import requests
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Entry as DbEntry, KhojUser, NotionConfig
|
||||
|
||||
@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
|
||||
CALLOUT = "callout"
|
||||
|
||||
|
||||
class NotionToJsonl(TextEntries):
|
||||
class NotionToEntries(TextToEntries):
|
||||
def __init__(self, config: NotionConfig):
|
||||
super().__init__(config)
|
||||
self.config = NotionContentConfig(
|
||||
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.org_mode import orgnode
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from khoj.utils import state
|
||||
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OrgToJsonl(TextEntries):
|
||||
class OrgToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@@ -8,7 +8,7 @@ import base64
|
||||
from langchain.document_loaders import PyMuPDFLoader
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
@@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PdfToJsonl(TextEntries):
|
||||
class PdfToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@@ -35,7 +35,7 @@ class PdfToJsonl(TextEntries):
|
||||
|
||||
# Extract Entries from specified Pdf files
|
||||
with timer("Parse entries from PDF files into dictionaries", logger):
|
||||
current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
|
||||
current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))
|
||||
|
||||
# Split entries by max tokens supported by model
|
||||
with timer("Split entries by max token size supported by model", logger):
|
||||
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry
|
||||
from database.models import Entry as DbEntry, KhojUser
|
||||
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PlaintextToJsonl(TextEntries):
|
||||
class PlaintextToEntries(TextToEntries):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEntries):
|
||||
try:
|
||||
plaintext_content = files[file]
|
||||
if file.endswith(("html", "htm", "xml")):
|
||||
plaintext_content = PlaintextToJsonl.extract_html_content(
|
||||
plaintext_content = PlaintextToEntries.extract_html_content(
|
||||
plaintext_content, file.split(".")[-1]
|
||||
)
|
||||
files[file] = plaintext_content
|
||||
@@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEntries):
|
||||
|
||||
# Extract Entries from specified plaintext files
|
||||
with timer("Parse entries from plaintext files", logger):
|
||||
current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
|
||||
current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)
|
||||
|
||||
# Split entries by max tokens supported by model
|
||||
with timer("Split entries by max token size supported by model", logger):
|
||||
@@ -19,7 +19,7 @@ from database.adapters import EntryAdapters
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextEntries(ABC):
|
||||
class TextToEntries(ABC):
|
||||
def __init__(self, config: Any = None):
|
||||
self.embeddings_model = EmbeddingsModel()
|
||||
self.config = config
|
||||
@@ -85,10 +85,10 @@ class TextEntries(ABC):
|
||||
):
|
||||
with timer("Construct current entry hashes", logger):
|
||||
hashes_by_file = dict[str, set[str]]()
|
||||
current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
|
||||
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
|
||||
hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
|
||||
for entry in tqdm(current_entries, desc="Hashing Entries"):
|
||||
hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry))
|
||||
hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
|
||||
|
||||
num_deleted_embeddings = 0
|
||||
with timer("Preparing dataset for regeneration", logger):
|
||||
@@ -180,11 +180,11 @@ class TextEntries(ABC):
|
||||
):
|
||||
# Hash all current and previous entries to identify new entries
|
||||
with timer("Hash previous, current entries", logger):
|
||||
current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries))
|
||||
current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
|
||||
previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
|
||||
if deletion_filenames is not None:
|
||||
deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
|
||||
deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries))
|
||||
deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
|
||||
else:
|
||||
deletion_entry_hashes = []
|
||||
|
||||
@@ -10,12 +10,12 @@ from starlette.authentication import requires
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils import state, constants
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
|
||||
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
|
||||
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
|
||||
from khoj.processor.org_mode.org_to_entries import OrgToEntries
|
||||
from khoj.processor.pdf.pdf_to_entries import PdfToEntries
|
||||
from khoj.processor.github.github_to_entries import GithubToEntries
|
||||
from khoj.processor.notion.notion_to_entries import NotionToEntries
|
||||
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
|
||||
from khoj.search_type import text_search, image_search
|
||||
from khoj.routers.helpers import update_telemetry_state
|
||||
from khoj.utils.yaml import save_config_to_file_updated_state
|
||||
@@ -201,7 +201,7 @@ def configure_content(
|
||||
logger.info("🦄 Setting up search for orgmode notes")
|
||||
# Extract Entries, Generate Notes Embeddings
|
||||
text_search.setup(
|
||||
OrgToJsonl,
|
||||
OrgToEntries,
|
||||
files.get("org"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
@@ -216,7 +216,7 @@ def configure_content(
|
||||
logger.info("💎 Setting up search for markdown notes")
|
||||
# Extract Entries, Generate Markdown Embeddings
|
||||
text_search.setup(
|
||||
MarkdownToJsonl,
|
||||
MarkdownToEntries,
|
||||
files.get("markdown"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
@@ -232,7 +232,7 @@ def configure_content(
|
||||
logger.info("🖨️ Setting up search for pdf")
|
||||
# Extract Entries, Generate PDF Embeddings
|
||||
text_search.setup(
|
||||
PdfToJsonl,
|
||||
PdfToEntries,
|
||||
files.get("pdf"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
@@ -248,7 +248,7 @@ def configure_content(
|
||||
logger.info("📄 Setting up search for plaintext")
|
||||
# Extract Entries, Generate Plaintext Embeddings
|
||||
text_search.setup(
|
||||
PlaintextToJsonl,
|
||||
PlaintextToEntries,
|
||||
files.get("plaintext"),
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
@@ -281,7 +281,7 @@ def configure_content(
|
||||
logger.info("🐙 Setting up search for github")
|
||||
# Extract Entries, Generate Github Embeddings
|
||||
text_search.setup(
|
||||
GithubToJsonl,
|
||||
GithubToEntries,
|
||||
None,
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
@@ -298,7 +298,7 @@ def configure_content(
|
||||
if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
|
||||
logger.info("🔌 Setting up search for notion")
|
||||
text_search.setup(
|
||||
NotionToJsonl,
|
||||
NotionToEntries,
|
||||
None,
|
||||
regenerate=regenerate,
|
||||
full_corpus=full_corpus,
|
||||
|
||||
@@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder
|
||||
from khoj.utils.state import SearchType
|
||||
from khoj.utils.rawconfig import SearchResponse, Entry
|
||||
from khoj.utils.jsonl import load_jsonl
|
||||
from khoj.processor.text_to_jsonl import TextEntries
|
||||
from khoj.processor.text_to_entries import TextToEntries
|
||||
from database.adapters import EntryAdapters
|
||||
from database.models import KhojUser, Entry as DbEntry
|
||||
|
||||
@@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):
|
||||
|
||||
|
||||
def setup(
|
||||
text_to_jsonl: Type[TextEntries],
|
||||
text_to_entries: Type[TextToEntries],
|
||||
files: dict[str, str],
|
||||
regenerate: bool,
|
||||
full_corpus: bool = True,
|
||||
@@ -196,11 +196,11 @@ def setup(
|
||||
config=None,
|
||||
) -> None:
|
||||
if config:
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process(
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
|
||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||
)
|
||||
else:
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process(
|
||||
num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
|
||||
files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user