diff --git a/README.md b/README.md index 84da8a5f..5e8feb45 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ - **General** - **Natural**: Advanced natural language understanding using Transformer based ML Models - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models - - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos + - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files, Github repositories, and Photos - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/) ## Demos @@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0 - Install Khoj via `pip` and start Khoj backend in non-gui mode - Install Khoj plugin via Community Plugins settings pane on Obsidian app - Check the new Khoj plugin settings -- Let Khoj backend index the markdown, pdf files in the current Vault +- Let Khoj backend index the markdown, pdf, Github markdown files in the current Vault - Open Khoj plugin on Obsidian via Search button on Left Pane - Search \"*Announce plugin to folks*\" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/) - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin) @@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj ##### 2. Configure -- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories +- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf, Github repositories, and beancount directories - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml) ##### 3. Run diff --git a/pyproject.toml b/pyproject.toml index cf77ea79..db152d29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ dependencies = [ "aiohttp == 3.8.4", "langchain >= 0.0.187", "pypdf >= 3.9.0", + "llama-hub==0.0.3", ] dynamic = ["version"] diff --git a/src/khoj/configure.py b/src/khoj/configure.py index ae49678b..bf2de2e2 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -16,6 +16,7 @@ from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl +from khoj.processor.github.github_to_jsonl import GithubToJsonl from khoj.search_type import image_search, text_search from khoj.utils import constants, state from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel @@ -153,6 +154,20 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.image, search_config=config.search_type.image, regenerate=regenerate ) + if (t == state.SearchType.Github or t == None) and config.content_type.github: + logger.info("🐙 Setting up search for github") + # Extract Entries, Generate Github Embeddings + try: + model.github_search = text_search.setup( + GithubToJsonl, + config.content_type.github, + search_config=config.search_type.asymmetric, + regenerate=regenerate, + filters=[DateFilter(), WordFilter(), FileFilter()], + ) + except Exception as e: + logger.error(f"Failed to setup github search: {e}") + # Initialize External Plugin Search if (t == None or t in state.SearchType) and config.content_type.plugins: logger.info("🔌 Setting up search for plugins") diff --git a/src/khoj/interface/desktop/labelled_text_field.py b/src/khoj/interface/desktop/labelled_text_field.py index 4032c2c0..a897ee48 100644 --- a/src/khoj/interface/desktop/labelled_text_field.py +++ b/src/khoj/interface/desktop/labelled_text_field.py @@ -3,14 +3,18 @@ from PyQt6 import QtWidgets # Internal Packages from khoj.utils.config import ProcessorType +from khoj.utils.config import SearchType class LabelledTextField(QtWidgets.QWidget): - def __init__(self, title, processor_type: ProcessorType = None, default_value: str = None): + def __init__( + self, title, search_type: SearchType = None, processor_type: ProcessorType = None, default_value: str = None + ): QtWidgets.QWidget.__init__(self) layout = QtWidgets.QHBoxLayout() self.setLayout(layout) self.processor_type = processor_type + self.search_type = search_type self.label = QtWidgets.QLabel() self.label.setText(title) diff --git a/src/khoj/interface/desktop/main_window.py b/src/khoj/interface/desktop/main_window.py index 4237ac17..6fc061bd 100644 --- a/src/khoj/interface/desktop/main_window.py +++ b/src/khoj/interface/desktop/main_window.py @@ -62,7 +62,6 @@ class MainWindow(QtWidgets.QMainWindow): search_type, None ) or self.get_default_config(search_type=search_type) self.search_settings_panels += [self.add_settings_panel(current_content_config, search_type)] - # Add Conversation Processor Panel to Configure Screen self.processor_settings_panels = [] conversation_type = ProcessorType.Conversation @@ -88,6 +87,8 @@ class MainWindow(QtWidgets.QMainWindow): if search_type == SearchType.Image: current_content_files = current_content_config.get("input-directories", []) file_input_text = f"{search_type.name} Folders" + elif search_type == SearchType.Github: + return self.add_github_settings_panel(current_content_config, SearchType.Github) else: current_content_files = current_content_config.get("input-files", []) file_input_text = f"{search_type.name} Files" @@ -111,6 +112,47 @@ class MainWindow(QtWidgets.QMainWindow): return search_type_settings + def add_github_settings_panel(self, current_content_config: dict, search_type: SearchType): + search_type_settings = QtWidgets.QWidget() + search_type_layout = QtWidgets.QVBoxLayout(search_type_settings) + enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type) + # Add labelled text input field + input_fields = [] + + pat_token = current_content_config.get("pat-token", None) + input_field = LabelledTextField("pat-token", search_type=search_type, default_value=pat_token) + search_type_layout.addWidget(input_field) + input_fields += [input_field] + + repo_name = current_content_config.get("repo-name", None) + input_field = LabelledTextField("repo-name", search_type=search_type, default_value=repo_name) + search_type_layout.addWidget(input_field) + input_fields += [input_field] + + repo_owner = current_content_config.get("repo-owner", None) + input_field = LabelledTextField("repo-owner", search_type=search_type, default_value=repo_owner) + search_type_layout.addWidget(input_field) + input_fields += [input_field] + + repo_branch = current_content_config.get("repo-branch", None) + input_field = LabelledTextField("repo-branch", search_type=search_type, default_value=repo_branch) + search_type_layout.addWidget(input_field) + input_fields += [input_field] + + # Set enabled/disabled based on checkbox state + enable_search_type.setChecked(bool(repo_name or repo_owner or repo_branch or pat_token)) + for input_field in input_fields: + input_field.setEnabled(enable_search_type.isChecked()) + enable_search_type.stateChanged.connect(lambda _: [input_field.setEnabled(enable_search_type.isChecked()) for input_field in input_fields]) # type: ignore[attr-defined] + + # Add setting widgets for given search type to panel + search_type_layout.addWidget(enable_search_type) + for input_field in input_fields: + search_type_layout.addWidget(input_field) + self.wlayout.addWidget(search_type_settings) + + return search_type_settings + def add_processor_panel(self, current_conversation_config: dict, processor_type: ProcessorType): "Add Conversation Processor Panel" # Get current settings from config for given processor type @@ -185,7 +227,7 @@ class MainWindow(QtWidgets.QMainWindow): "Update config with search settings from UI" for settings_panel in self.search_settings_panels: for child in settings_panel.children(): - if not isinstance(child, (SearchCheckBox, FileBrowser)): + if not isinstance(child, (SearchCheckBox, FileBrowser, LabelledTextField)): continue if isinstance(child, SearchCheckBox): # Search Type Disabled @@ -207,6 +249,10 @@ class MainWindow(QtWidgets.QMainWindow): self.new_config["content-type"][child.search_type.value]["input-files"] = ( child.getPaths() if child.getPaths() != [] else None ) + elif isinstance(child, LabelledTextField): + self.new_config["content-type"][child.search_type.value][ + child.label.text() + ] = child.input_field.toPlainText() def update_processor_settings(self): "Update config with conversation settings from UI" diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 84fbd7f5..388c0207 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -66,6 +66,8 @@ return render_ledger(query, data); } else if (type === "pdf") { return render_pdf(query, data); + } else if (type == "github") { + return render_markdown(query, data); } else { return `
${item.entry}
`).join("\n") @@ -296,7 +298,7 @@ text-align: left; white-space: pre-line; } - #results-markdown { + #results-markdown, #results-github { text-align: left; } #results-music, diff --git a/src/khoj/processor/github/__init__.py b/src/khoj/processor/github/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py new file mode 100644 index 00000000..6886d9a9 --- /dev/null +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -0,0 +1,89 @@ +import logging +from llama_index import download_loader +from khoj.utils.helpers import timer +from khoj.utils.rawconfig import GithubContentConfig +from llama_hub.github_repo import GithubRepositoryReader, GithubClient +from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl +from khoj.processor.text_to_jsonl import TextToJsonl +from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils import state + +logger = logging.getLogger(__name__) + + +class GithubToJsonl: + def __init__(self, config: GithubContentConfig): + self.config = config + download_loader("GithubRepositoryReader") + + def process(self, previous_entries=None): + try: + self.initialize() + except Exception as e: + logger.error( + f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}" + ) + raise e + + with timer("Download github repo", logger): + try: + docs = self.get_markdown_files() + except Exception as e: + logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}") + raise e + + logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}") + + with timer("Extract markdown entries from github repo", logger): + current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( + *GithubToJsonl.extract_markdown_entries(docs) + ) + + with timer("Split entries by max token size supported by model", logger): + current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256) + + # Identify, mark and merge any new entries with previous entries + with timer("Identify new or updated entries", logger): + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = TextToJsonl.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) + + with timer("Write markdown entries to JSONL file", logger): + # Process Each Entry from All Notes Files + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) + + # Compress JSONL formatted Data + if self.config.compressed_jsonl.suffix == ".gz": + compress_jsonl_data(jsonl_data, self.config.compressed_jsonl) + elif self.config.compressed_jsonl.suffix == ".jsonl": + dump_jsonl(jsonl_data, self.config.compressed_jsonl) + + return entries_with_ids + + def initialize(self): + logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}") + github_client = GithubClient(self.config.pat_token) + self.loader = GithubRepositoryReader( + github_client, + owner=self.config.repo_owner, + repo=self.config.repo_name, + filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE), + verbose=state.verbose > 1, + ) + + def get_markdown_files(self): + return self.loader.load_data(branch=self.config.repo_branch) + + @staticmethod + def extract_markdown_entries(markdown_files): + entries = [] + entry_to_file_map = [] + for doc in markdown_files: + entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( + doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map + ) + return entries, dict(entry_to_file_map) diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py index 83c82374..f743d5d5 100644 --- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py @@ -41,7 +41,7 @@ class JsonlToJsonl(TextToJsonl): if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: - entries_with_ids = self.mark_entries_for_update( + entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", diff --git a/src/khoj/processor/ledger/beancount_to_jsonl.py b/src/khoj/processor/ledger/beancount_to_jsonl.py index 49c43301..347012a3 100644 --- a/src/khoj/processor/ledger/beancount_to_jsonl.py +++ b/src/khoj/processor/ledger/beancount_to_jsonl.py @@ -48,7 +48,7 @@ class BeancountToJsonl(TextToJsonl): if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: - entries_with_ids = self.mark_entries_for_update( + entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", logger=logger ) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 0179e05e..efb508ad 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -49,7 +49,7 @@ class MarkdownToJsonl(TextToJsonl): if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: - entries_with_ids = self.mark_entries_for_update( + entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", logger=logger ) @@ -101,27 +101,37 @@ class MarkdownToJsonl(TextToJsonl): "Extract entries by heading from specified Markdown files" # Regex to extract Markdown Entries by Heading - markdown_heading_regex = r"^#" entries = [] entry_to_file_map = [] for markdown_file in markdown_files: with open(markdown_file, "r", encoding="utf8") as f: markdown_content = f.read() - markdown_entries_per_file = [] - any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) - for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): - # Add heading level as the regex split removed it from entries with headings - prefix = "#" if entry.startswith("#") else "# " if any_headings else "" - stripped_entry = entry.strip(empty_escape_sequences) - if stripped_entry != "": - markdown_entries_per_file.append(f"{prefix}{stripped_entry}") - - entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) - entries.extend(markdown_entries_per_file) + entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( + markdown_content, markdown_file, entries, entry_to_file_map + ) return entries, dict(entry_to_file_map) + @staticmethod + def process_single_markdown_file( + markdown_content: str, markdown_file: Path, entries: List, entry_to_file_map: List + ): + markdown_heading_regex = r"^#" + + markdown_entries_per_file = [] + any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE) + for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): + # Add heading level as the regex split removed it from entries with headings + prefix = "#" if entry.startswith("#") else "# " if any_headings else "" + stripped_entry = entry.strip(empty_escape_sequences) + if stripped_entry != "": + markdown_entries_per_file.append(f"{prefix}{stripped_entry}") + + entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file)) + entries.extend(markdown_entries_per_file) + return entries, entry_to_file_map + @staticmethod def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]: "Convert each Markdown entries into a dictionary" diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index e5ec7cc6..96f2238e 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -50,7 +50,7 @@ class OrgToJsonl(TextToJsonl): if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: - entries_with_ids = self.mark_entries_for_update( + entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", logger=logger ) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index 27c03d55..d8092cc8 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -48,7 +48,7 @@ class PdfToJsonl(TextToJsonl): if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: - entries_with_ids = self.mark_entries_for_update( + entries_with_ids = TextToJsonl.mark_entries_for_update( current_entries, previous_entries, key="compiled", logger=logger ) diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py index 3dd0d1b5..d85d6998 100644 --- a/src/khoj/processor/text_to_jsonl.py +++ b/src/khoj/processor/text_to_jsonl.py @@ -60,8 +60,9 @@ class TextToJsonl(ABC): return chunked_entries + @staticmethod def mark_entries_for_update( - self, current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None + current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None ) -> List[Tuple[int, Entry]]: # Hash all current and previous entries to identify new entries with timer("Hash previous, current entries", logger): diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index bf165a1e..1f98496c 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -121,6 +121,17 @@ def search( with timer("Collating results took", logger): results = text_search.collate_results(hits, entries, results_count) + elif (t == SearchType.Github or t == None) and state.model.github_search: + # query github embeddings + with timer("Query took", logger): + hits, entries = text_search.query( + user_query, state.model.github_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe + ) + + # collate and return results + with timer("Collating results took", logger): + results = text_search.collate_results(hits, entries, results_count) + elif (t == SearchType.Ledger or t == None) and state.model.ledger_search: # query transactions with timer("Query took", logger): diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 7b590d13..cc461855 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -23,6 +23,7 @@ class SearchType(str, Enum): Markdown = "markdown" Image = "image" Pdf = "pdf" + Github = "github" class ProcessorType(str, Enum): @@ -64,6 +65,7 @@ class SearchModels: markdown_search: TextSearchModel = None pdf_search: TextSearchModel = None image_search: ImageSearchModel = None + github_search: TextSearchModel = None plugin_search: Dict[str, TextSearchModel] = None diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 87eb07ac..df5494a8 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -47,6 +47,14 @@ default_config = { "compressed-jsonl": "~/.khoj/content/music/music.jsonl.gz", "embeddings-file": "~/.khoj/content/music/music_embeddings.pt", }, + "github": { + "pat-token": None, + "repo-name": None, + "repo-owner": None, + "repo-branch": "master", + "compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz", + "embeddings-file": "~/.khoj/content/github/github_embeddings.pt", + }, }, "search-type": { "symmetric": { diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 72d82ce9..371918b6 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -32,6 +32,15 @@ class TextContentConfig(ConfigBase): return input_filter +class GithubContentConfig(ConfigBase): + pat_token: str + repo_name: str + repo_owner: str + repo_branch: Optional[str] = "master" + compressed_jsonl: Path + embeddings_file: Path + + class ImageContentConfig(ConfigBase): input_directories: Optional[List[Path]] input_filter: Optional[List[str]] @@ -57,6 +66,7 @@ class ContentConfig(ConfigBase): music: Optional[TextContentConfig] markdown: Optional[TextContentConfig] pdf: Optional[TextContentConfig] + github: Optional[GithubContentConfig] plugins: Optional[Dict[str, TextContentConfig]]