From a6cd96a6a91f5652c08a50036ecc787662479eaa Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Tue, 13 Jun 2023 14:40:06 -0700
Subject: [PATCH 01/19] Add a Github plugin which can be used to read from a
 Github repository

---
 README.md                                     |  6 +-
 pyproject.toml                                |  1 +
 src/khoj/configure.py                         | 15 ++++
 .../interface/desktop/labelled_text_field.py  |  6 +-
 src/khoj/interface/desktop/main_window.py     | 50 ++++++++++-
 src/khoj/interface/web/index.html             |  4 +-
 src/khoj/processor/github/__init__.py         |  0
 src/khoj/processor/github/github_to_jsonl.py  | 89 +++++++++++++++++++
 src/khoj/processor/jsonl/jsonl_to_jsonl.py    |  2 +-
 .../processor/ledger/beancount_to_jsonl.py    |  2 +-
 .../processor/markdown/markdown_to_jsonl.py   | 36 +++++---
 src/khoj/processor/org_mode/org_to_jsonl.py   |  2 +-
 src/khoj/processor/pdf/pdf_to_jsonl.py        |  2 +-
 src/khoj/processor/text_to_jsonl.py           |  3 +-
 src/khoj/routers/api.py                       | 11 +++
 src/khoj/utils/config.py                      |  2 +
 src/khoj/utils/constants.py                   |  8 ++
 src/khoj/utils/rawconfig.py                   | 10 +++
 18 files changed, 224 insertions(+), 25 deletions(-)
 create mode 100644 src/khoj/processor/github/__init__.py
 create mode 100644 src/khoj/processor/github/github_to_jsonl.py

diff --git a/README.md b/README.md
index 84da8a5f..5e8feb45 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@
 - **General**
   - **Natural**: Advanced natural language understanding using Transformer based ML Models
   - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
-  - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos
+  - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files, Github repositories, and Photos
   - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)
 
 ## Demos
@@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 - Install Khoj via `pip` and start Khoj backend in non-gui mode
 - Install Khoj plugin via Community Plugins settings pane on Obsidian app
 - Check the new Khoj plugin settings
-- Let Khoj backend index the markdown, pdf files in the current Vault
+- Let Khoj backend index the markdown, pdf, Github markdown files in the current Vault
 - Open Khoj plugin on Obsidian via Search button on Left Pane
 - Search \"*Announce plugin to folks*\" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj
 
 ##### 2. Configure
 
-- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories
+- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf, Github repositories, and beancount directories
 - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml)
 
 ##### 3. Run
diff --git a/pyproject.toml b/pyproject.toml
index cf77ea79..db152d29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ dependencies = [
     "aiohttp == 3.8.4",
     "langchain >= 0.0.187",
     "pypdf >= 3.9.0",
+    "llama-hub==0.0.3",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index ae49678b..bf2de2e2 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -16,6 +16,7 @@ from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
+from khoj.processor.github.github_to_jsonl import GithubToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -153,6 +154,20 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.image, search_config=config.search_type.image, regenerate=regenerate
         )
 
+    if (t == state.SearchType.Github or t == None) and config.content_type.github:
+        logger.info("🐙 Setting up search for github")
+        # Extract Entries, Generate Github Embeddings
+        try:
+            model.github_search = text_search.setup(
+                GithubToJsonl,
+                config.content_type.github,
+                search_config=config.search_type.asymmetric,
+                regenerate=regenerate,
+                filters=[DateFilter(), WordFilter(), FileFilter()],
+            )
+        except Exception as e:
+            logger.error(f"Failed to setup github search: {e}")
+
     # Initialize External Plugin Search
     if (t == None or t in state.SearchType) and config.content_type.plugins:
         logger.info("🔌 Setting up search for plugins")
diff --git a/src/khoj/interface/desktop/labelled_text_field.py b/src/khoj/interface/desktop/labelled_text_field.py
index 4032c2c0..a897ee48 100644
--- a/src/khoj/interface/desktop/labelled_text_field.py
+++ b/src/khoj/interface/desktop/labelled_text_field.py
@@ -3,14 +3,18 @@ from PyQt6 import QtWidgets
 
 # Internal Packages
 from khoj.utils.config import ProcessorType
+from khoj.utils.config import SearchType
 
 
 class LabelledTextField(QtWidgets.QWidget):
-    def __init__(self, title, processor_type: ProcessorType = None, default_value: str = None):
+    def __init__(
+        self, title, search_type: SearchType = None, processor_type: ProcessorType = None, default_value: str = None
+    ):
         QtWidgets.QWidget.__init__(self)
         layout = QtWidgets.QHBoxLayout()
         self.setLayout(layout)
         self.processor_type = processor_type
+        self.search_type = search_type
 
         self.label = QtWidgets.QLabel()
         self.label.setText(title)
diff --git a/src/khoj/interface/desktop/main_window.py b/src/khoj/interface/desktop/main_window.py
index 4237ac17..6fc061bd 100644
--- a/src/khoj/interface/desktop/main_window.py
+++ b/src/khoj/interface/desktop/main_window.py
@@ -62,7 +62,6 @@ class MainWindow(QtWidgets.QMainWindow):
                 search_type, None
             ) or self.get_default_config(search_type=search_type)
             self.search_settings_panels += [self.add_settings_panel(current_content_config, search_type)]
-
         # Add Conversation Processor Panel to Configure Screen
         self.processor_settings_panels = []
         conversation_type = ProcessorType.Conversation
@@ -88,6 +87,8 @@ class MainWindow(QtWidgets.QMainWindow):
         if search_type == SearchType.Image:
             current_content_files = current_content_config.get("input-directories", [])
             file_input_text = f"{search_type.name} Folders"
+        elif search_type == SearchType.Github:
+            return self.add_github_settings_panel(current_content_config, SearchType.Github)
         else:
             current_content_files = current_content_config.get("input-files", [])
             file_input_text = f"{search_type.name} Files"
@@ -111,6 +112,47 @@ class MainWindow(QtWidgets.QMainWindow):
 
         return search_type_settings
 
+    def add_github_settings_panel(self, current_content_config: dict, search_type: SearchType):
+        search_type_settings = QtWidgets.QWidget()
+        search_type_layout = QtWidgets.QVBoxLayout(search_type_settings)
+        enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type)
+        # Add labelled text input field
+        input_fields = []
+
+        pat_token = current_content_config.get("pat-token", None)
+        input_field = LabelledTextField("pat-token", search_type=search_type, default_value=pat_token)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        repo_name = current_content_config.get("repo-name", None)
+        input_field = LabelledTextField("repo-name", search_type=search_type, default_value=repo_name)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        repo_owner = current_content_config.get("repo-owner", None)
+        input_field = LabelledTextField("repo-owner", search_type=search_type, default_value=repo_owner)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        repo_branch = current_content_config.get("repo-branch", None)
+        input_field = LabelledTextField("repo-branch", search_type=search_type, default_value=repo_branch)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        # Set enabled/disabled based on checkbox state
+        enable_search_type.setChecked(bool(repo_name or repo_owner or repo_branch or pat_token))
+        for input_field in input_fields:
+            input_field.setEnabled(enable_search_type.isChecked())
+        enable_search_type.stateChanged.connect(lambda _: [input_field.setEnabled(enable_search_type.isChecked()) for input_field in input_fields])  # type: ignore[attr-defined]
+
+        # Add setting widgets for given search type to panel
+        search_type_layout.addWidget(enable_search_type)
+        for input_field in input_fields:
+            search_type_layout.addWidget(input_field)
+        self.wlayout.addWidget(search_type_settings)
+
+        return search_type_settings
+
     def add_processor_panel(self, current_conversation_config: dict, processor_type: ProcessorType):
         "Add Conversation Processor Panel"
         # Get current settings from config for given processor type
@@ -185,7 +227,7 @@ class MainWindow(QtWidgets.QMainWindow):
         "Update config with search settings from UI"
         for settings_panel in self.search_settings_panels:
             for child in settings_panel.children():
-                if not isinstance(child, (SearchCheckBox, FileBrowser)):
+                if not isinstance(child, (SearchCheckBox, FileBrowser, LabelledTextField)):
                     continue
                 if isinstance(child, SearchCheckBox):
                     # Search Type Disabled
@@ -207,6 +249,10 @@ class MainWindow(QtWidgets.QMainWindow):
                         self.new_config["content-type"][child.search_type.value]["input-files"] = (
                             child.getPaths() if child.getPaths() != [] else None
                         )
+                elif isinstance(child, LabelledTextField):
+                    self.new_config["content-type"][child.search_type.value][
+                        child.label.text()
+                    ] = child.input_field.toPlainText()
 
     def update_processor_settings(self):
         "Update config with conversation settings from UI"
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index 84fbd7f5..388c0207 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -66,6 +66,8 @@
                 return render_ledger(query, data);
             } else if (type === "pdf") {
                 return render_pdf(query, data);
+            } else if (type == "github") {
+                return render_markdown(query, data);
             } else {
                 return `<div id="results-plugin">`
                     + data.map((item) => `<p>${item.entry}</p>`).join("\n")
@@ -296,7 +298,7 @@
             text-align: left;
             white-space: pre-line;
         }
-         #results-markdown {
+         #results-markdown, #results-github {
             text-align: left;
         }
         #results-music,
diff --git a/src/khoj/processor/github/__init__.py b/src/khoj/processor/github/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
new file mode 100644
index 00000000..6886d9a9
--- /dev/null
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -0,0 +1,89 @@
+import logging
+from llama_index import download_loader
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import GithubContentConfig
+from llama_hub.github_repo import GithubRepositoryReader, GithubClient
+from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.text_to_jsonl import TextToJsonl
+from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils import state
+
+logger = logging.getLogger(__name__)
+
+
+class GithubToJsonl:
+    def __init__(self, config: GithubContentConfig):
+        self.config = config
+        download_loader("GithubRepositoryReader")
+
+    def process(self, previous_entries=None):
+        try:
+            self.initialize()
+        except Exception as e:
+            logger.error(
+                f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
+            )
+            raise e
+
+        with timer("Download github repo", logger):
+            try:
+                docs = self.get_markdown_files()
+            except Exception as e:
+                logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}")
+                raise e
+
+        logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}")
+
+        with timer("Extract markdown entries from github repo", logger):
+            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
+                *GithubToJsonl.extract_markdown_entries(docs)
+            )
+
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            if not previous_entries:
+                entries_with_ids = list(enumerate(current_entries))
+            else:
+                entries_with_ids = TextToJsonl.mark_entries_for_update(
+                    current_entries, previous_entries, key="compiled", logger=logger
+                )
+
+        with timer("Write markdown entries to JSONL file", logger):
+            # Process Each Entry from All Notes Files
+            entries = list(map(lambda entry: entry[1], entries_with_ids))
+            jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
+
+            # Compress JSONL formatted Data
+            if self.config.compressed_jsonl.suffix == ".gz":
+                compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
+            elif self.config.compressed_jsonl.suffix == ".jsonl":
+                dump_jsonl(jsonl_data, self.config.compressed_jsonl)
+
+        return entries_with_ids
+
+    def initialize(self):
+        logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
+        github_client = GithubClient(self.config.pat_token)
+        self.loader = GithubRepositoryReader(
+            github_client,
+            owner=self.config.repo_owner,
+            repo=self.config.repo_name,
+            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
+            verbose=state.verbose > 1,
+        )
+
+    def get_markdown_files(self):
+        return self.loader.load_data(branch=self.config.repo_branch)
+
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in markdown_files:
+            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py
index 83c82374..f743d5d5 100644
--- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py
+++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py
@@ -41,7 +41,7 @@ class JsonlToJsonl(TextToJsonl):
             if not previous_entries:
                 entries_with_ids = list(enumerate(current_entries))
             else:
-                entries_with_ids = self.mark_entries_for_update(
+                entries_with_ids = TextToJsonl.mark_entries_for_update(
                     current_entries,
                     previous_entries,
                     key="compiled",
diff --git a/src/khoj/processor/ledger/beancount_to_jsonl.py b/src/khoj/processor/ledger/beancount_to_jsonl.py
index 49c43301..347012a3 100644
--- a/src/khoj/processor/ledger/beancount_to_jsonl.py
+++ b/src/khoj/processor/ledger/beancount_to_jsonl.py
@@ -48,7 +48,7 @@ class BeancountToJsonl(TextToJsonl):
             if not previous_entries:
                 entries_with_ids = list(enumerate(current_entries))
             else:
-                entries_with_ids = self.mark_entries_for_update(
+                entries_with_ids = TextToJsonl.mark_entries_for_update(
                     current_entries, previous_entries, key="compiled", logger=logger
                 )
 
diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py
index 0179e05e..efb508ad 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@@ -49,7 +49,7 @@ class MarkdownToJsonl(TextToJsonl):
             if not previous_entries:
                 entries_with_ids = list(enumerate(current_entries))
             else:
-                entries_with_ids = self.mark_entries_for_update(
+                entries_with_ids = TextToJsonl.mark_entries_for_update(
                     current_entries, previous_entries, key="compiled", logger=logger
                 )
 
@@ -101,27 +101,37 @@ class MarkdownToJsonl(TextToJsonl):
         "Extract entries by heading from specified Markdown files"
 
         # Regex to extract Markdown Entries by Heading
-        markdown_heading_regex = r"^#"
 
         entries = []
         entry_to_file_map = []
         for markdown_file in markdown_files:
             with open(markdown_file, "r", encoding="utf8") as f:
                 markdown_content = f.read()
-                markdown_entries_per_file = []
-                any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
-                for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
-                    # Add heading level as the regex split removed it from entries with headings
-                    prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
-                    stripped_entry = entry.strip(empty_escape_sequences)
-                    if stripped_entry != "":
-                        markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
-
-                entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
-                entries.extend(markdown_entries_per_file)
+                entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                    markdown_content, markdown_file, entries, entry_to_file_map
+                )
 
         return entries, dict(entry_to_file_map)
 
+    @staticmethod
+    def process_single_markdown_file(
+        markdown_content: str, markdown_file: Path, entries: List, entry_to_file_map: List
+    ):
+        markdown_heading_regex = r"^#"
+
+        markdown_entries_per_file = []
+        any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
+        for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
+            # Add heading level as the regex split removed it from entries with headings
+            prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
+            stripped_entry = entry.strip(empty_escape_sequences)
+            if stripped_entry != "":
+                markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
+
+        entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
+        entries.extend(markdown_entries_per_file)
+        return entries, entry_to_file_map
+
     @staticmethod
     def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
         "Convert each Markdown entries into a dictionary"
diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py
index e5ec7cc6..96f2238e 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@@ -50,7 +50,7 @@ class OrgToJsonl(TextToJsonl):
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
-            entries_with_ids = self.mark_entries_for_update(
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries, previous_entries, key="compiled", logger=logger
             )
 
diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py
index 27c03d55..d8092cc8 100644
--- a/src/khoj/processor/pdf/pdf_to_jsonl.py
+++ b/src/khoj/processor/pdf/pdf_to_jsonl.py
@@ -48,7 +48,7 @@ class PdfToJsonl(TextToJsonl):
             if not previous_entries:
                 entries_with_ids = list(enumerate(current_entries))
             else:
-                entries_with_ids = self.mark_entries_for_update(
+                entries_with_ids = TextToJsonl.mark_entries_for_update(
                     current_entries, previous_entries, key="compiled", logger=logger
                 )
 
diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index 3dd0d1b5..d85d6998 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -60,8 +60,9 @@ class TextToJsonl(ABC):
 
         return chunked_entries
 
+    @staticmethod
     def mark_entries_for_update(
-        self, current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None
+        current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None
     ) -> List[Tuple[int, Entry]]:
         # Hash all current and previous entries to identify new entries
         with timer("Hash previous, current entries", logger):
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index bf165a1e..1f98496c 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -121,6 +121,17 @@ def search(
         with timer("Collating results took", logger):
             results = text_search.collate_results(hits, entries, results_count)
 
+    elif (t == SearchType.Github or t == None) and state.model.github_search:
+        # query github embeddings
+        with timer("Query took", logger):
+            hits, entries = text_search.query(
+                user_query, state.model.github_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+            )
+
+        # collate and return results
+        with timer("Collating results took", logger):
+            results = text_search.collate_results(hits, entries, results_count)
+
     elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
         # query transactions
         with timer("Query took", logger):
diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index 7b590d13..cc461855 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -23,6 +23,7 @@ class SearchType(str, Enum):
     Markdown = "markdown"
     Image = "image"
     Pdf = "pdf"
+    Github = "github"
 
 
 class ProcessorType(str, Enum):
@@ -64,6 +65,7 @@ class SearchModels:
     markdown_search: TextSearchModel = None
     pdf_search: TextSearchModel = None
     image_search: ImageSearchModel = None
+    github_search: TextSearchModel = None
     plugin_search: Dict[str, TextSearchModel] = None
 
 
diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py
index 87eb07ac..df5494a8 100644
--- a/src/khoj/utils/constants.py
+++ b/src/khoj/utils/constants.py
@@ -47,6 +47,14 @@ default_config = {
             "compressed-jsonl": "~/.khoj/content/music/music.jsonl.gz",
             "embeddings-file": "~/.khoj/content/music/music_embeddings.pt",
         },
+        "github": {
+            "pat-token": None,
+            "repo-name": None,
+            "repo-owner": None,
+            "repo-branch": "master",
+            "compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz",
+            "embeddings-file": "~/.khoj/content/github/github_embeddings.pt",
+        },
     },
     "search-type": {
         "symmetric": {
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index 72d82ce9..371918b6 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -32,6 +32,15 @@ class TextContentConfig(ConfigBase):
         return input_filter
 
 
+class GithubContentConfig(ConfigBase):
+    pat_token: str
+    repo_name: str
+    repo_owner: str
+    repo_branch: Optional[str] = "master"
+    compressed_jsonl: Path
+    embeddings_file: Path
+
+
 class ImageContentConfig(ConfigBase):
     input_directories: Optional[List[Path]]
     input_filter: Optional[List[str]]
@@ -57,6 +66,7 @@ class ContentConfig(ConfigBase):
     music: Optional[TextContentConfig]
     markdown: Optional[TextContentConfig]
     pdf: Optional[TextContentConfig]
+    github: Optional[GithubContentConfig]
     plugins: Optional[Dict[str, TextContentConfig]]
 
 

From 08d79f5ba4e57c3d32c76dbb896559508639dba0 Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Tue, 13 Jun 2023 15:52:36 -0700
Subject: [PATCH 02/19] Unify types used in Github and other text-based
 configs. Fix typing issues

---
 src/khoj/configure.py                        |  1 +
 src/khoj/interface/desktop/main_window.py    |  4 +++-
 src/khoj/processor/github/github_to_jsonl.py |  4 ++--
 src/khoj/processor/text_to_jsonl.py          |  4 ++--
 src/khoj/routers/api.py                      |  5 ++++-
 src/khoj/utils/rawconfig.py                  | 13 +++++++------
 6 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index bf2de2e2..f9735fea 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -49,6 +49,7 @@ def configure_server(args, required=False):
     # Initialize the search type and model from Config
     state.search_index_lock.acquire()
     state.SearchType = configure_search_types(state.config)
+    state.model = SearchModels()
     state.model = configure_search(state.model, state.config, args.regenerate)
     state.search_index_lock.release()
 
diff --git a/src/khoj/interface/desktop/main_window.py b/src/khoj/interface/desktop/main_window.py
index 6fc061bd..5a3df3ec 100644
--- a/src/khoj/interface/desktop/main_window.py
+++ b/src/khoj/interface/desktop/main_window.py
@@ -163,7 +163,9 @@ class MainWindow(QtWidgets.QMainWindow):
         processor_type_layout = QtWidgets.QVBoxLayout(processor_type_settings)
         enable_conversation = ProcessorCheckBox(f"Conversation", processor_type)
         # Add file browser to set input files for given processor type
-        input_field = LabelledTextField("OpenAI API Key", processor_type, current_openai_api_key)
+        input_field = LabelledTextField(
+            "OpenAI API Key", processor_type=processor_type, default_value=current_openai_api_key
+        )
 
         # Set enabled/disabled based on checkbox state
         enable_conversation.setChecked(current_openai_api_key is not None)
diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 6886d9a9..b989c12f 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -11,9 +11,9 @@ from khoj.utils import state
 logger = logging.getLogger(__name__)
 
 
-class GithubToJsonl:
+class GithubToJsonl(TextToJsonl):
     def __init__(self, config: GithubContentConfig):
-        self.config = config
+        super().__init__(config)
         download_loader("GithubRepositoryReader")
 
     def process(self, previous_entries=None):
diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_jsonl.py
index d85d6998..f7bca376 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_jsonl.py
@@ -6,14 +6,14 @@ from typing import Callable, List, Tuple
 from khoj.utils.helpers import timer
 
 # Internal Packages
-from khoj.utils.rawconfig import Entry, TextContentConfig
+from khoj.utils.rawconfig import Entry, TextConfigBase
 
 
 logger = logging.getLogger(__name__)
 
 
 class TextToJsonl(ABC):
-    def __init__(self, config: TextContentConfig):
+    def __init__(self, config: TextConfigBase):
         self.config = config
 
     @abstractmethod
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 1f98496c..dec496d6 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -43,7 +43,10 @@ def get_config_types():
     return [
         search_type.value
         for search_type in SearchType
-        if search_type.value in configured_content_types
+        if (
+            search_type.value in configured_content_types
+            and getattr(state.model, f"{search_type.value}_search") is not None
+        )
         or ("plugins" in configured_content_types and search_type.name in configured_content_types["plugins"])
     ]
 
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index 371918b6..21ff93d5 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -16,11 +16,14 @@ class ConfigBase(BaseModel):
         allow_population_by_field_name = True
 
 
-class TextContentConfig(ConfigBase):
-    input_files: Optional[List[Path]]
-    input_filter: Optional[List[str]]
+class TextConfigBase(ConfigBase):
     compressed_jsonl: Path
     embeddings_file: Path
+
+
+class TextContentConfig(TextConfigBase):
+    input_files: Optional[List[Path]]
+    input_filter: Optional[List[str]]
     index_heading_entries: Optional[bool] = False
 
     @validator("input_filter")
@@ -32,13 +35,11 @@ class TextContentConfig(ConfigBase):
         return input_filter
 
 
-class GithubContentConfig(ConfigBase):
+class GithubContentConfig(TextConfigBase):
     pat_token: str
     repo_name: str
     repo_owner: str
     repo_branch: Optional[str] = "master"
-    compressed_jsonl: Path
-    embeddings_file: Path
 
 
 class ImageContentConfig(ConfigBase):

From 019d3732deb484c1f6b8aa55feb750c366cbfb30 Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Tue, 13 Jun 2023 16:06:54 -0700
Subject: [PATCH 03/19] Rename orgmode_search to org_search

---
 src/khoj/configure.py    | 2 +-
 src/khoj/routers/api.py  | 4 ++--
 src/khoj/utils/config.py | 2 +-
 tests/test_client.py     | 8 ++++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index f9735fea..6bb844cc 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -91,7 +91,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == state.SearchType.Org or t == None) and config.content_type.org:
         logger.info("🦄 Setting up search for orgmode notes")
         # Extract Entries, Generate Notes Embeddings
-        model.orgmode_search = text_search.setup(
+        model.org_search = text_search.setup(
             OrgToJsonl,
             config.content_type.org,
             search_config=config.search_type.asymmetric,
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index dec496d6..7e80b827 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -91,11 +91,11 @@ def search(
         logger.debug(f"Return response from query cache")
         return state.query_cache[query_cache_key]
 
-    if (t == SearchType.Org or t == None) and state.model.orgmode_search:
+    if (t == SearchType.Org or t == None) and state.model.org_search:
         # query org-mode notes
         with timer("Query took", logger):
             hits, entries = text_search.query(
-                user_query, state.model.orgmode_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+                user_query, state.model.org_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
             )
 
         # collate and return results
diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index cc461855..a83f7814 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -59,7 +59,7 @@ class ImageSearchModel:
 
 @dataclass
 class SearchModels:
-    orgmode_search: TextSearchModel = None
+    org_search: TextSearchModel = None
     ledger_search: TextSearchModel = None
     music_search: TextSearchModel = None
     markdown_search: TextSearchModel = None
diff --git a/tests/test_client.py b/tests/test_client.py
index cee0ee67..d74b4f2d 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -166,7 +166,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
 # ----------------------------------------------------------------------------------------------------
 def test_notes_search(client, content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    model.org_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
     user_query = quote("How to git install application?")
 
     # Act
@@ -183,7 +183,7 @@ def test_notes_search(client, content_config: ContentConfig, search_config: Sear
 def test_notes_search_with_only_filters(client, content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter(), FileFilter()]
-    model.orgmode_search = text_search.setup(
+    model.org_search = text_search.setup(
         OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
     )
     user_query = quote('+"Emacs" file:"*.org"')
@@ -202,7 +202,7 @@ def test_notes_search_with_only_filters(client, content_config: ContentConfig, s
 def test_notes_search_with_include_filter(client, content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter()]
-    model.orgmode_search = text_search.setup(
+    model.org_search = text_search.setup(
         OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
     )
     user_query = quote('How to git install application? +"Emacs"')
@@ -221,7 +221,7 @@ def test_notes_search_with_include_filter(client, content_config: ContentConfig,
 def test_notes_search_with_exclude_filter(client, content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter()]
-    model.orgmode_search = text_search.setup(
+    model.org_search = text_search.setup(
         OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters
     )
     user_query = quote('How to git install application? -"clone"')

From 3a61919344e7ab1955a0b539c8fe98fc95ee4aea Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Tue, 13 Jun 2023 16:32:47 -0700
Subject: [PATCH 04/19] Fix failing unit tests by hard-coding model presence of
 expected search types

---
 tests/conftest.py                | 4 ++++
 tests/test_conversation_utils.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 84ec658d..6ef2394f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -159,6 +159,10 @@ def client(content_config: ContentConfig, search_config: SearchConfig, processor
     state.config.search_type = search_config
     state.SearchType = configure_search_types(state.config)
 
+    # These lines help us Mock the Search models for these search types
+    state.model.org_search = {}
+    state.model.image_search = {}
+
     configure_routes(app)
     return TestClient(app)
 
diff --git a/tests/test_conversation_utils.py b/tests/test_conversation_utils.py
index 06a507c5..ac8a7665 100644
--- a/tests/test_conversation_utils.py
+++ b/tests/test_conversation_utils.py
@@ -32,7 +32,7 @@ class TestTruncateMessage:
 
     def test_truncate_message_first_large(self):
         chat_messages = ChatMessageFactory.build_batch(25)
-        big_chat_message = ChatMessageFactory.build(content=factory.Faker("paragraph", nb_sentences=1000))
+        big_chat_message = ChatMessageFactory.build(content=factory.Faker("paragraph", nb_sentences=2000))
         big_chat_message.content = big_chat_message.content + "\n" + "Question?"
         copy_big_chat_message = big_chat_message.copy()
         chat_messages.insert(0, big_chat_message)

From 751edfefe5e868b7cd9c7f7ee193ec7ea9a64df4 Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Tue, 13 Jun 2023 16:55:58 -0700
Subject: [PATCH 05/19] Add separate unit test for github. Will only run of a
 PAT token is set

---
 README.md                 |  7 ++++++-
 tests/conftest.py         | 10 ++++++++++
 tests/test_text_search.py | 13 +++++++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5e8feb45..96aebc6c 100644
--- a/README.md
+++ b/README.md
@@ -328,6 +328,11 @@ Add your OpenAI API to Khoj by using either of the two options below:
   1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
   2. Interact with them from the [Khoj Swagger docs](http://locahost:8000/docs)[^2]
 
+### Use a Github Repository as a source
+Note that this plugin is currently *only* indexing Markdown files. It will ignore all other files in the repository. This is because Khoj, as it stands, is a semantic search engine. Eventually, we hope to get to a state where you can search for any file in your repository and even explain code.
+
+1. Get a [pat token](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) with `repo` and `read:org` scopes in the classic flow.
+2. Configure your settings to include the `owner` and `repo_name`. The `owner` will be the organization name if the repo is in an organization. The `repo_name` will be the name of the repository. Optionally, you can also supply a branch name. If no branch name is supplied, the `master` branch will be used.
 
 ## Performance
 
@@ -458,7 +463,7 @@ conda activate khoj
 
 #### Before Creating PR
 
-1. Run Tests
+1. Run Tests. If you get an error complaining about a missing `fast_tokenizer_file`, follow the solution [in this Github issue](https://github.com/UKPLab/sentence-transformers/issues/1659).
    ```shell
    pytest
    ```
diff --git a/tests/conftest.py b/tests/conftest.py
index 6ef2394f..e061f279 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -16,6 +16,7 @@ from khoj.utils.rawconfig import (
     ConversationProcessorConfig,
     ProcessorConfig,
     TextContentConfig,
+    GithubContentConfig,
     ImageContentConfig,
     SearchConfig,
     TextSearchConfig,
@@ -89,6 +90,15 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
         )
     }
 
+    content_config.github = GithubContentConfig(
+        pat_token=os.getenv("GITHUB_PAT_TOKEN"),
+        repo_name="lantern",
+        repo_owner="khoj-ai",
+        repo_branch="master",
+        compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
+        embeddings_file=content_dir.joinpath("github_embeddings.pt"),
+    )
+
     filters = [DateFilter(), WordFilter(), FileFilter()]
     text_search.setup(
         JsonlToJsonl, content_config.plugins["plugin1"], search_config.asymmetric, regenerate=False, filters=filters
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 830feb9b..6eecac07 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -1,6 +1,7 @@
 # System Packages
 import logging
 from pathlib import Path
+import os
 
 # External Packages
 import pytest
@@ -10,6 +11,7 @@ from khoj.utils.state import model
 from khoj.search_type import text_search
 from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.github.github_to_jsonl import GithubToJsonl
 
 
 # Test
@@ -170,3 +172,14 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
     # Cleanup
     # reset input_files in config to empty list
     content_config.org.input_files = []
+
+
+# ----------------------------------------------------------------------------------------------------
+@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
+def test_asymmetric_setup_github(content_config: ContentConfig, search_config: SearchConfig):
+    # Act
+    # Regenerate notes embeddings during asymmetric setup
+    github_model = text_search.setup(GithubToJsonl, content_config.github, search_config.asymmetric, regenerate=True)
+
+    # Assert
+    assert len(github_model.entries) > 1

From 07ade2262a18277a30ca6073e7cac71398391116 Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Tue, 13 Jun 2023 17:03:03 -0700
Subject: [PATCH 06/19] Set default value of pat_token in conftest.py to be
 empty string

---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index e061f279..d4638adb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -91,7 +91,7 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
     }
 
     content_config.github = GithubContentConfig(
-        pat_token=os.getenv("GITHUB_PAT_TOKEN"),
+        pat_token=os.getenv("GITHUB_PAT_TOKEN", ""),
         repo_name="lantern",
         repo_owner="khoj-ai",
         repo_branch="master",

From ac96f43b1bb04455f2b40b2a242b02618a51df66 Mon Sep 17 00:00:00 2001
From: Saba <narmiabas@gmail.com>
Date: Fri, 16 Jun 2023 23:46:25 -0700
Subject: [PATCH 07/19] Remove try-catch specific to Github plugin; consolidate
 GUI logic

---
 src/khoj/configure.py                     | 18 ++++++--------
 src/khoj/interface/desktop/main_window.py | 30 ++++++++---------------
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 6bb844cc..38a29efb 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -49,7 +49,6 @@ def configure_server(args, required=False):
     # Initialize the search type and model from Config
     state.search_index_lock.acquire()
     state.SearchType = configure_search_types(state.config)
-    state.model = SearchModels()
     state.model = configure_search(state.model, state.config, args.regenerate)
     state.search_index_lock.release()
 
@@ -158,16 +157,13 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == state.SearchType.Github or t == None) and config.content_type.github:
         logger.info("🐙 Setting up search for github")
         # Extract Entries, Generate Github Embeddings
-        try:
-            model.github_search = text_search.setup(
-                GithubToJsonl,
-                config.content_type.github,
-                search_config=config.search_type.asymmetric,
-                regenerate=regenerate,
-                filters=[DateFilter(), WordFilter(), FileFilter()],
-            )
-        except Exception as e:
-            logger.error(f"Failed to setup github search: {e}")
+        model.github_search = text_search.setup(
+            GithubToJsonl,
+            config.content_type.github,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[DateFilter(), WordFilter(), FileFilter()],
+        )
 
     # Initialize External Plugin Search
     if (t == None or t in state.SearchType) and config.content_type.plugins:
diff --git a/src/khoj/interface/desktop/main_window.py b/src/khoj/interface/desktop/main_window.py
index 5a3df3ec..24239772 100644
--- a/src/khoj/interface/desktop/main_window.py
+++ b/src/khoj/interface/desktop/main_window.py
@@ -119,28 +119,18 @@ class MainWindow(QtWidgets.QMainWindow):
         # Add labelled text input field
         input_fields = []
 
-        pat_token = current_content_config.get("pat-token", None)
-        input_field = LabelledTextField("pat-token", search_type=search_type, default_value=pat_token)
-        search_type_layout.addWidget(input_field)
-        input_fields += [input_field]
-
-        repo_name = current_content_config.get("repo-name", None)
-        input_field = LabelledTextField("repo-name", search_type=search_type, default_value=repo_name)
-        search_type_layout.addWidget(input_field)
-        input_fields += [input_field]
-
-        repo_owner = current_content_config.get("repo-owner", None)
-        input_field = LabelledTextField("repo-owner", search_type=search_type, default_value=repo_owner)
-        search_type_layout.addWidget(input_field)
-        input_fields += [input_field]
-
-        repo_branch = current_content_config.get("repo-branch", None)
-        input_field = LabelledTextField("repo-branch", search_type=search_type, default_value=repo_branch)
-        search_type_layout.addWidget(input_field)
-        input_fields += [input_field]
+        fields = ["pat-token", "repo-name", "repo-owner", "repo-branch"]
+        active = False
+        for field in fields:
+            field_value = current_content_config.get(field, None)
+            input_field = LabelledTextField(field, search_type=search_type, default_value=field_value)
+            search_type_layout.addWidget(input_field)
+            input_fields += [input_field]
+            if field_value:
+                active = True
 
         # Set enabled/disabled based on checkbox state
-        enable_search_type.setChecked(bool(repo_name or repo_owner or repo_branch or pat_token))
+        enable_search_type.setChecked(active)
         for input_field in input_fields:
             input_field.setEnabled(enable_search_type.isChecked())
         enable_search_type.stateChanged.connect(lambda _: [input_field.setEnabled(enable_search_type.isChecked()) for input_field in input_fields])  # type: ignore[attr-defined]

From c29c141a7e979f843090a19f1c79f85f7e586433 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 01:39:57 -0700
Subject: [PATCH 08/19] Use Github Rest API to index Markdown files in Github
 Repository

The Llama_Hub Github plugin is fairly limited.

The Github Rest API is well supported and can easily be extended to
index commit messages, issues, discussions, PRs etc.
---
 pyproject.toml                               |  2 +-
 src/khoj/processor/github/github_to_jsonl.py | 57 +++++++++++---------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index db152d29..f44849ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,7 @@ dependencies = [
     "aiohttp == 3.8.4",
     "langchain >= 0.0.187",
     "pypdf >= 3.9.0",
-    "llama-hub==0.0.3",
+    "requests >= 2.26.0",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index b989c12f..fa8eb8a1 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,12 +1,16 @@
+# Standard Packages
 import logging
-from llama_index import download_loader
+
+# External Packages
+import requests
+
+# Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import GithubContentConfig
-from llama_hub.github_repo import GithubRepositoryReader, GithubClient
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
-from khoj.utils import state
+
 
 logger = logging.getLogger(__name__)
 
@@ -14,18 +18,11 @@ logger = logging.getLogger(__name__)
 class GithubToJsonl(TextToJsonl):
     def __init__(self, config: GithubContentConfig):
         super().__init__(config)
-        download_loader("GithubRepositoryReader")
+        self.config = config
+        self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
 
     def process(self, previous_entries=None):
-        try:
-            self.initialize()
-        except Exception as e:
-            logger.error(
-                f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
-            )
-            raise e
-
-        with timer("Download github repo", logger):
+        with timer("Download markdown files from github repo", logger):
             try:
                 docs = self.get_markdown_files()
             except Exception as e:
@@ -64,19 +61,27 @@ class GithubToJsonl(TextToJsonl):
 
         return entries_with_ids
 
-    def initialize(self):
-        logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
-        github_client = GithubClient(self.config.pat_token)
-        self.loader = GithubRepositoryReader(
-            github_client,
-            owner=self.config.repo_owner,
-            repo=self.config.repo_name,
-            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
-            verbose=state.verbose > 1,
-        )
-
     def get_markdown_files(self):
-        return self.loader.load_data(branch=self.config.repo_branch)
+        # set the url to get the contents of the repository
+        repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
+        # set the headers to include the authentication token
+        headers = {"Authorization": f"{self.config.pat_token}"}
+
+        # get the contents of the repository
+        response = requests.get(repo_content_url, headers=headers)
+        contents = response.json()
+
+        markdown_files = []
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Get text from each markdown file
+                file_content_url = f'{self.repo_url}/contents/{item["path"]}'
+                headers["Accept"] = "application/vnd.github.v3.raw"
+                markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
+                markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
+
+        return markdown_files
 
     @staticmethod
     def extract_markdown_entries(markdown_files):
@@ -84,6 +89,6 @@ class GithubToJsonl(TextToJsonl):
         entry_to_file_map = []
         for doc in markdown_files:
             entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
-                doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
+                doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)

From 31d17d0b22ce7cbc2337ee48dedd885c757f715e Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 02:50:58 -0700
Subject: [PATCH 09/19] Index commits message from repository with the github
 plugin

---
 src/khoj/processor/github/github_to_jsonl.py | 34 +++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index fa8eb8a1..bcadd09b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,12 +1,13 @@
 # Standard Packages
 import logging
+from typing import Dict, List
 
 # External Packages
 import requests
 
 # Internal Packages
 from khoj.utils.helpers import timer
-from khoj.utils.rawconfig import GithubContentConfig
+from khoj.utils.rawconfig import Entry, GithubContentConfig
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
@@ -36,6 +37,9 @@ class GithubToJsonl(TextToJsonl):
                 *GithubToJsonl.extract_markdown_entries(docs)
             )
 
+        with timer("Extract commit messages from github repo", logger):
+            current_entries += self.convert_commits_to_entries(self.get_commits())
+
         with timer("Split entries by max token size supported by model", logger):
             current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
 
@@ -83,6 +87,34 @@ class GithubToJsonl(TextToJsonl):
 
         return markdown_files
 
+    def get_commits(self) -> List[Dict]:
+        # Get commit messages from the repository using the Github API
+        headers = {"Authorization": f"{self.config.pat_token}"}
+        response = requests.get(f"{self.repo_url}/commits", headers=headers)
+        raw_commits = response.json()
+
+        # Extract commit messages from the response
+        commits = []
+        for commit in raw_commits:
+            commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+        return commits
+
+    def convert_commits_to_entries(self, commits) -> List[Entry]:
+        entries: List[Entry] = []
+        for commit in commits:
+            compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}'
+            entries.append(
+                Entry(
+                    compiled=compiled,
+                    raw=f'### {commit["content"]}',
+                    heading=commit["content"].split("\n")[0],
+                    file=commit["path"],
+                )
+            )
+
+        return entries
+
     @staticmethod
     def extract_markdown_entries(markdown_files):
         entries = []

From 0c1c7583b58ec299bbc40b5f34d512bcacedf812 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 03:38:16 -0700
Subject: [PATCH 10/19] Handle pagination, API rate limits. Get all commits
 from Github repo

---
 src/khoj/processor/github/github_to_jsonl.py | 35 ++++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index bcadd09b..f862c951 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,5 +1,6 @@
 # Standard Packages
 import logging
+import time
 from typing import Dict, List
 
 # External Packages
@@ -75,6 +76,13 @@ class GithubToJsonl(TextToJsonl):
         response = requests.get(repo_content_url, headers=headers)
         contents = response.json()
 
+        # If the rate limit is reached, wait for the reset time
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return self.get_markdown_files()
+
         markdown_files = []
         for item in contents["tree"]:
             # Find all markdown files in the repository
@@ -90,13 +98,28 @@ class GithubToJsonl(TextToJsonl):
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
         headers = {"Authorization": f"{self.config.pat_token}"}
-        response = requests.get(f"{self.repo_url}/commits", headers=headers)
-        raw_commits = response.json()
-
-        # Extract commit messages from the response
+        commits_url = f"{self.repo_url}/commits"
         commits = []
-        for commit in raw_commits:
-            commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+        while commits_url is not None:
+            # Get the next page of commits
+            response = requests.get(commits_url, headers=headers)
+
+            # If the rate limit is reached, wait for the reset time
+            if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+                wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+                logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+                time.sleep(wait_time)
+                continue
+
+            raw_commits = response.json()
+
+            # Extract commit messages from the response
+            for commit in raw_commits:
+                commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+            # Get the URL for the next page of commits, if any
+            commits_url = response.links.get("next", {}).get("url")
 
         return commits
 

From 63ec84ad782cad2287f641895bcc16622444d8a5 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 04:23:01 -0700
Subject: [PATCH 11/19] Store Github URL of Markdown files on Github in file
 jsonl param

---
 src/khoj/processor/github/github_to_jsonl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f862c951..d21f688b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -87,11 +87,16 @@ class GithubToJsonl(TextToJsonl):
         for item in contents["tree"]:
             # Find all markdown files in the repository
             if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
+
                 # Get text from each markdown file
                 file_content_url = f'{self.repo_url}/contents/{item["path"]}'
                 headers["Accept"] = "application/vnd.github.v3.raw"
                 markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
-                markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
+
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": markdown_file_contents, "path": url_path}]
 
         return markdown_files
 

From 3f24e53b6e2fd60f88e7988bea3ee5a080cfe206 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 04:26:40 -0700
Subject: [PATCH 12/19] Render URL as link in web interface if file param of
 result is a web link

---
 src/khoj/interface/web/index.html | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index 388c0207..d9874072 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -34,6 +34,9 @@
         function render_markdown(query, data) {
             var md = window.markdownit();
             return md.render(data.map(function (item) {
+                lines = item.entry.split("\n")
+                if (item.additional.file.startsWith("http"))
+                    return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`
                 return `${item.entry}`
             }).join("\n"));
         }

From 10d4c38ce9e9bcb2dc72aea03f15fbfe91b877be Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:06:46 -0700
Subject: [PATCH 13/19] Extract Wait for rate limit reset logic into a function
 for reuse

---
 src/khoj/processor/github/github_to_jsonl.py | 33 +++++++++++---------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index d21f688b..f622b5e9 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -23,6 +23,16 @@ class GithubToJsonl(TextToJsonl):
         self.config = config
         self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
 
+    @staticmethod
+    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return func(*args, **kwargs)
+        else:
+            return
+
     def process(self, previous_entries=None):
         with timer("Download markdown files from github repo", logger):
             try:
@@ -76,12 +86,10 @@ class GithubToJsonl(TextToJsonl):
         response = requests.get(repo_content_url, headers=headers)
         contents = response.json()
 
-        # If the rate limit is reached, wait for the reset time
-        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
-            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
-            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
-            time.sleep(wait_time)
-            return self.get_markdown_files()
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_markdown_files)
+        if result is not None:
+            return result
 
         markdown_files = []
         for item in contents["tree"]:
@@ -109,16 +117,13 @@ class GithubToJsonl(TextToJsonl):
         while commits_url is not None:
             # Get the next page of commits
             response = requests.get(commits_url, headers=headers)
-
-            # If the rate limit is reached, wait for the reset time
-            if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
-                wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
-                logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
-                time.sleep(wait_time)
-                continue
-
             raw_commits = response.json()
 
+            # Wait for rate limit reset if needed
+            result = self.wait_for_rate_limit_reset(response, self.get_commits)
+            if result is not None:
+                return result
+
             # Extract commit messages from the response
             for commit in raw_commits:
                 commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]

From 9c70af960ca2d1c5dd357ef4a1dcf085833d1a2c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:08:57 -0700
Subject: [PATCH 14/19] Extract logic to get file content from Github into a
 separate method

---
 src/khoj/processor/github/github_to_jsonl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f622b5e9..f29bef2b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -98,16 +98,23 @@ class GithubToJsonl(TextToJsonl):
                 # Create URL for each markdown file on Github
                 url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
 
-                # Get text from each markdown file
-                file_content_url = f'{self.repo_url}/contents/{item["path"]}'
-                headers["Accept"] = "application/vnd.github.v3.raw"
-                markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
-
                 # Add markdown file contents and URL to list
-                markdown_files += [{"content": markdown_file_contents, "path": url_path}]
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
 
         return markdown_files
 
+    def get_file_contents(self, file_url):
+        # Get text from each markdown file
+        headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        response = requests.get(file_url, headers=headers)
+
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
+        if result is not None:
+            return result
+
+        return response.content.decode("utf-8")
+
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
         headers = {"Authorization": f"{self.config.pat_token}"}

From 87975e589aa562521e24cf55588e756bb2623b6b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:12:47 -0700
Subject: [PATCH 15/19] Fix passing auth token to Github API to increase rate
 limits by x85

- Previously wasn't prefixing "token" to PAT token in Auth header
  This resulted in the request being considered unauthenticated

- Unauthenticated requests to Github API are limited to 60 requests/hour
  Authenticated requests to Github API are allowed 5000 requests/hour
---
 src/khoj/processor/github/github_to_jsonl.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f29bef2b..d76f4979 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -77,12 +77,9 @@ class GithubToJsonl(TextToJsonl):
         return entries_with_ids
 
     def get_markdown_files(self):
-        # set the url to get the contents of the repository
+        # Get the contents of the repository
         repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
-        # set the headers to include the authentication token
-        headers = {"Authorization": f"{self.config.pat_token}"}
-
-        # get the contents of the repository
+        headers = {"Authorization": f"token {self.config.pat_token}"}
         response = requests.get(repo_content_url, headers=headers)
         contents = response.json()
 
@@ -91,6 +88,7 @@ class GithubToJsonl(TextToJsonl):
         if result is not None:
             return result
 
+        # Extract markdown files from the repository
         markdown_files = []
         for item in contents["tree"]:
             # Find all markdown files in the repository
@@ -105,7 +103,7 @@ class GithubToJsonl(TextToJsonl):
 
     def get_file_contents(self, file_url):
         # Get text from each markdown file
-        headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
         response = requests.get(file_url, headers=headers)
 
         # Wait for rate limit reset if needed
@@ -117,8 +115,8 @@ class GithubToJsonl(TextToJsonl):
 
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
-        headers = {"Authorization": f"{self.config.pat_token}"}
         commits_url = f"{self.repo_url}/commits"
+        headers = {"Authorization": f"token {self.config.pat_token}"}
         commits = []
 
         while commits_url is not None:

From 6fdac2441652d1fb6bcb0cfda413a32ced7ba24f Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:20:05 -0700
Subject: [PATCH 16/19] Set page size to 100 to reduce requests required to
 Github API to 1/3

- Default is 30. So number of paginated requests required to get all
  items (commits, files) will reduce by 67%

- No need to increase page size for the get tree Github API request from
  `get_markdown_files'

  Get tree Github API doesn't support pagination and return 100K items
  in response. This should be way more than enough for our current
  use-cases
---
 src/khoj/processor/github/github_to_jsonl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index d76f4979..789d8259 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -117,11 +117,12 @@ class GithubToJsonl(TextToJsonl):
         # Get commit messages from the repository using the Github API
         commits_url = f"{self.repo_url}/commits"
         headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"per_page": 100}
         commits = []
 
         while commits_url is not None:
             # Get the next page of commits
-            response = requests.get(commits_url, headers=headers)
+            response = requests.get(commits_url, headers=headers, params=params)
             raw_commits = response.json()
 
             # Wait for rate limit reset if needed

From e31a540a5efcb9235629fd63167054e8dd9c95c7 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:47:15 -0700
Subject: [PATCH 17/19] Get all md files recursively in repository by passing
 recursive param

Previously the `get_markdown_files' method was only getting files at
root of the repository

Fix, improve logger messages in github to jsonl processor
---
 src/khoj/processor/github/github_to_jsonl.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 789d8259..80d55f38 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -38,10 +38,10 @@ class GithubToJsonl(TextToJsonl):
             try:
                 docs = self.get_markdown_files()
             except Exception as e:
-                logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}")
+                logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}")
                 raise e
 
-        logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}")
+        logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}")
 
         with timer("Extract markdown entries from github repo", logger):
             current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
@@ -63,7 +63,7 @@ class GithubToJsonl(TextToJsonl):
                     current_entries, previous_entries, key="compiled", logger=logger
                 )
 
-        with timer("Write markdown entries to JSONL file", logger):
+        with timer("Write github entries to JSONL file", logger):
             # Process Each Entry from All Notes Files
             entries = list(map(lambda entry: entry[1], entries_with_ids))
             jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
@@ -80,7 +80,8 @@ class GithubToJsonl(TextToJsonl):
         # Get the contents of the repository
         repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
         headers = {"Authorization": f"token {self.config.pat_token}"}
-        response = requests.get(repo_content_url, headers=headers)
+        params = {"recursive": "true"}
+        response = requests.get(repo_content_url, headers=headers, params=params)
         contents = response.json()
 
         # Wait for rate limit reset if needed

From 595cc5b0f5790d2ed63d08ec879431c380585047 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 02:24:25 -0700
Subject: [PATCH 18/19] Use printer icon for PDF logs. Only split lines if file
 at web link in web interface

---
 src/khoj/configure.py             | 2 +-
 src/khoj/interface/web/index.html | 9 +++++----
 tests/test_text_search.py         | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 38a29efb..3aa39f10 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -136,7 +136,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
 
     # Initialize PDF Search
     if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
-        logger.info("💸 Setting up search for pdf")
+        logger.info("🖨️ Setting up search for pdf")
         # Extract Entries, Generate PDF Embeddings
         model.pdf_search = text_search.setup(
             PdfToJsonl,
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index d9874072..7ed321e1 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -34,10 +34,11 @@
         function render_markdown(query, data) {
             var md = window.markdownit();
             return md.render(data.map(function (item) {
-                lines = item.entry.split("\n")
-                if (item.additional.file.startsWith("http"))
-                    return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`
-                return `${item.entry}`
+                if (item.additional.file.startsWith("http")) {
+                    lines = item.entry.split("\n");
+                    return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`;
+                }
+                return `${item.entry}`;
             }).join("\n"));
         }
 
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index 6eecac07..6634a671 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -178,7 +178,7 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
 @pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set")
 def test_asymmetric_setup_github(content_config: ContentConfig, search_config: SearchConfig):
     # Act
-    # Regenerate notes embeddings during asymmetric setup
+    # Regenerate github embeddings to test asymmetric setup without caching
     github_model = text_search.setup(GithubToJsonl, content_config.github, search_config.asymmetric, regenerate=True)
 
     # Assert

From a44cde2865aae68f04e510a571b52a7760d6c829 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 04:51:25 -0700
Subject: [PATCH 19/19] Show hint to re-index vault if wonky results in
 Obsidian search modal

Remove spurious indentation in Obsidian styles.css

Resolves #207
---
 src/interface/obsidian/src/search_modal.ts |  9 ++++++++
 src/interface/obsidian/styles.css          | 26 ++++++++++++++--------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/interface/obsidian/src/search_modal.ts b/src/interface/obsidian/src/search_modal.ts
index 4bad70f6..84ebeaa4 100644
--- a/src/interface/obsidian/src/search_modal.ts
+++ b/src/interface/obsidian/src/search_modal.ts
@@ -127,6 +127,15 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
         let entry_snipped_indicator = result.entry.split('\n').length > lines_to_render ? ' **...**' : '';
         let snipped_entry = result.entry.split('\n').slice(0, lines_to_render).join('\n');
 
+        // Show reindex hint on first search result
+        if (this.resultContainerEl.children.length == 1) {
+            let infoHintEl = createEl("div",{ cls: 'khoj-info-hint' });
+            el.insertAdjacentElement("beforebegin", infoHintEl);
+            setTimeout(() => {
+                infoHintEl.setText('Unexpected results? Try re-index your vault from the Khoj plugin settings to fix it.');
+            }, 3000);
+        }
+
         // Show filename of each search result for context
         el.createEl("div",{ cls: 'khoj-result-file' }).setText(filename ?? "");
         let result_el = el.createEl("div", { cls: 'khoj-result-entry' })
diff --git a/src/interface/obsidian/styles.css b/src/interface/obsidian/styles.css
index e3597abe..be8065b8 100644
--- a/src/interface/obsidian/styles.css
+++ b/src/interface/obsidian/styles.css
@@ -148,9 +148,9 @@ If your plugin does not need CSS, delete this file.
 
 .khoj-result-file {
     font-weight: 600;
-  }
+}
 
-  .khoj-result-entry {
+.khoj-result-entry {
     color: var(--text-muted);
     margin-left: 2em;
     padding-left: 0.5em;
@@ -160,17 +160,25 @@ If your plugin does not need CSS, delete this file.
     border-left-style: solid;
     border-left-color: var(--color-accent-2);
     white-space: normal;
-  }
+}
 
-  .khoj-result-entry > * {
+.khoj-result-entry > * {
     font-size: var(--font-ui-medium);
-  }
+}
 
-  .khoj-result-entry > p {
+.khoj-result-entry > p {
     margin-top: 0.2em;
     margin-bottom: 0.2em;
-  }
+}
 
-  .khoj-result-entry p br {
+.khoj-result-entry p br {
     display: none;
-  }
+}
+
+.khoj-info-hint {
+    color: var(--text-muted);
+    font-size: var(--font-ui-small);
+    font-style: italic;
+    text-align: center;
+    margin-bottom: 0.5em;
+}