From c29c141a7e979f843090a19f1c79f85f7e586433 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 01:39:57 -0700
Subject: [PATCH 01/10] Use Github Rest API to index Markdown files in Github
 Repository

The Llama_Hub GitHub plugin is fairly limited.

The GitHub REST API is well supported and can easily be extended to
index commit messages, issues, discussions, PRs, etc.
---
 pyproject.toml                               |  2 +-
 src/khoj/processor/github/github_to_jsonl.py | 57 +++++++++++---------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index db152d29..f44849ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,7 @@ dependencies = [
     "aiohttp == 3.8.4",
     "langchain >= 0.0.187",
     "pypdf >= 3.9.0",
-    "llama-hub==0.0.3",
+    "requests >= 2.26.0",
 ]
 dynamic = ["version"]
 
diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index b989c12f..fa8eb8a1 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,12 +1,16 @@
+# Standard Packages
 import logging
-from llama_index import download_loader
+
+# External Packages
+import requests
+
+# Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import GithubContentConfig
-from llama_hub.github_repo import GithubRepositoryReader, GithubClient
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
-from khoj.utils import state
+
 
 logger = logging.getLogger(__name__)
 
@@ -14,18 +18,11 @@ logger = logging.getLogger(__name__)
 class GithubToJsonl(TextToJsonl):
     def __init__(self, config: GithubContentConfig):
         super().__init__(config)
-        download_loader("GithubRepositoryReader")
+        self.config = config
+        self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
 
     def process(self, previous_entries=None):
-        try:
-            self.initialize()
-        except Exception as e:
-            logger.error(
-                f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
-            )
-            raise e
-
-        with timer("Download github repo", logger):
+        with timer("Download markdown files from github repo", logger):
             try:
                 docs = self.get_markdown_files()
             except Exception as e:
@@ -64,19 +61,27 @@ class GithubToJsonl(TextToJsonl):
 
         return entries_with_ids
 
-    def initialize(self):
-        logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
-        github_client = GithubClient(self.config.pat_token)
-        self.loader = GithubRepositoryReader(
-            github_client,
-            owner=self.config.repo_owner,
-            repo=self.config.repo_name,
-            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
-            verbose=state.verbose > 1,
-        )
-
     def get_markdown_files(self):
-        return self.loader.load_data(branch=self.config.repo_branch)
+        # set the url to get the contents of the repository
+        repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
+        # set the headers to include the authentication token
+        headers = {"Authorization": f"{self.config.pat_token}"}
+
+        # get the contents of the repository
+        response = requests.get(repo_content_url, headers=headers)
+        contents = response.json()
+
+        markdown_files = []
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Get text from each markdown file
+                file_content_url = f'{self.repo_url}/contents/{item["path"]}'
+                headers["Accept"] = "application/vnd.github.v3.raw"
+                markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
+                markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
+
+        return markdown_files
 
     @staticmethod
     def extract_markdown_entries(markdown_files):
@@ -84,6 +89,6 @@ class GithubToJsonl(TextToJsonl):
         entry_to_file_map = []
         for doc in markdown_files:
             entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
-                doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
+                doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)

From 31d17d0b22ce7cbc2337ee48dedd885c757f715e Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 02:50:58 -0700
Subject: [PATCH 02/10] Index commit messages from repository with the github
 plugin

---
 src/khoj/processor/github/github_to_jsonl.py | 34 +++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index fa8eb8a1..bcadd09b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,12 +1,13 @@
 # Standard Packages
 import logging
+from typing import Dict, List
 
 # External Packages
 import requests
 
 # Internal Packages
 from khoj.utils.helpers import timer
-from khoj.utils.rawconfig import GithubContentConfig
+from khoj.utils.rawconfig import Entry, GithubContentConfig
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
@@ -36,6 +37,9 @@ class GithubToJsonl(TextToJsonl):
                 *GithubToJsonl.extract_markdown_entries(docs)
             )
 
+        with timer("Extract commit messages from github repo", logger):
+            current_entries += self.convert_commits_to_entries(self.get_commits())
+
         with timer("Split entries by max token size supported by model", logger):
             current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
 
@@ -83,6 +87,34 @@ class GithubToJsonl(TextToJsonl):
 
         return markdown_files
 
+    def get_commits(self) -> List[Dict]:
+        # Get commit messages from the repository using the Github API
+        headers = {"Authorization": f"{self.config.pat_token}"}
+        response = requests.get(f"{self.repo_url}/commits", headers=headers)
+        raw_commits = response.json()
+
+        # Extract commit messages from the response
+        commits = []
+        for commit in raw_commits:
+            commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+        return commits
+
+    def convert_commits_to_entries(self, commits) -> List[Entry]:
+        entries: List[Entry] = []
+        for commit in commits:
+            compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}'
+            entries.append(
+                Entry(
+                    compiled=compiled,
+                    raw=f'### {commit["content"]}',
+                    heading=commit["content"].split("\n")[0],
+                    file=commit["path"],
+                )
+            )
+
+        return entries
+
     @staticmethod
     def extract_markdown_entries(markdown_files):
         entries = []

From 0c1c7583b58ec299bbc40b5f34d512bcacedf812 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 03:38:16 -0700
Subject: [PATCH 03/10] Handle pagination, API rate limits. Get all commits
 from Github repo

---
 src/khoj/processor/github/github_to_jsonl.py | 35 ++++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index bcadd09b..f862c951 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,5 +1,6 @@
 # Standard Packages
 import logging
+import time
 from typing import Dict, List
 
 # External Packages
@@ -75,6 +76,13 @@ class GithubToJsonl(TextToJsonl):
         response = requests.get(repo_content_url, headers=headers)
         contents = response.json()
 
+        # If the rate limit is reached, wait for the reset time
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return self.get_markdown_files()
+
         markdown_files = []
         for item in contents["tree"]:
             # Find all markdown files in the repository
@@ -90,13 +98,28 @@ class GithubToJsonl(TextToJsonl):
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
         headers = {"Authorization": f"{self.config.pat_token}"}
-        response = requests.get(f"{self.repo_url}/commits", headers=headers)
-        raw_commits = response.json()
-
-        # Extract commit messages from the response
+        commits_url = f"{self.repo_url}/commits"
         commits = []
-        for commit in raw_commits:
-            commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+        while commits_url is not None:
+            # Get the next page of commits
+            response = requests.get(commits_url, headers=headers)
+
+            # If the rate limit is reached, wait for the reset time
+            if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+                wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+                logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+                time.sleep(wait_time)
+                continue
+
+            raw_commits = response.json()
+
+            # Extract commit messages from the response
+            for commit in raw_commits:
+                commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+            # Get the URL for the next page of commits, if any
+            commits_url = response.links.get("next", {}).get("url")
 
         return commits
 

From 63ec84ad782cad2287f641895bcc16622444d8a5 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 04:23:01 -0700
Subject: [PATCH 04/10] Store Github URL of Markdown files on Github in file
 jsonl param

---
 src/khoj/processor/github/github_to_jsonl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f862c951..d21f688b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -87,11 +87,16 @@ class GithubToJsonl(TextToJsonl):
         for item in contents["tree"]:
             # Find all markdown files in the repository
             if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
+
                 # Get text from each markdown file
                 file_content_url = f'{self.repo_url}/contents/{item["path"]}'
                 headers["Accept"] = "application/vnd.github.v3.raw"
                 markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
-                markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
+
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": markdown_file_contents, "path": url_path}]
 
         return markdown_files
 

From 3f24e53b6e2fd60f88e7988bea3ee5a080cfe206 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 17 Jun 2023 04:26:40 -0700
Subject: [PATCH 05/10] Render URL as link in web interface if file param of
 result is a web link

---
 src/khoj/interface/web/index.html | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index 388c0207..d9874072 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -34,6 +34,9 @@
         function render_markdown(query, data) {
             var md = window.markdownit();
             return md.render(data.map(function (item) {
+                lines = item.entry.split("\n")
+                if (item.additional.file.startsWith("http"))
+                    return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`
                 return `${item.entry}`
             }).join("\n"));
         }

From 10d4c38ce9e9bcb2dc72aea03f15fbfe91b877be Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:06:46 -0700
Subject: [PATCH 06/10] Extract Wait for rate limit reset logic into a function
 for reuse

---
 src/khoj/processor/github/github_to_jsonl.py | 33 +++++++++++---------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index d21f688b..f622b5e9 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -23,6 +23,16 @@ class GithubToJsonl(TextToJsonl):
         self.config = config
         self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
 
+    @staticmethod
+    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return func(*args, **kwargs)
+        else:
+            return
+
     def process(self, previous_entries=None):
         with timer("Download markdown files from github repo", logger):
             try:
@@ -76,12 +86,10 @@ class GithubToJsonl(TextToJsonl):
         response = requests.get(repo_content_url, headers=headers)
         contents = response.json()
 
-        # If the rate limit is reached, wait for the reset time
-        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
-            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
-            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
-            time.sleep(wait_time)
-            return self.get_markdown_files()
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_markdown_files)
+        if result is not None:
+            return result
 
         markdown_files = []
         for item in contents["tree"]:
@@ -109,16 +117,13 @@ class GithubToJsonl(TextToJsonl):
         while commits_url is not None:
             # Get the next page of commits
             response = requests.get(commits_url, headers=headers)
-
-            # If the rate limit is reached, wait for the reset time
-            if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
-                wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
-                logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
-                time.sleep(wait_time)
-                continue
-
             raw_commits = response.json()
 
+            # Wait for rate limit reset if needed
+            result = self.wait_for_rate_limit_reset(response, self.get_commits)
+            if result is not None:
+                return result
+
             # Extract commit messages from the response
             for commit in raw_commits:
                 commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]

From 9c70af960ca2d1c5dd357ef4a1dcf085833d1a2c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:08:57 -0700
Subject: [PATCH 07/10] Extract logic to get file content from Github into a
 separate method

---
 src/khoj/processor/github/github_to_jsonl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f622b5e9..f29bef2b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -98,16 +98,23 @@ class GithubToJsonl(TextToJsonl):
                 # Create URL for each markdown file on Github
                 url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
 
-                # Get text from each markdown file
-                file_content_url = f'{self.repo_url}/contents/{item["path"]}'
-                headers["Accept"] = "application/vnd.github.v3.raw"
-                markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
-
                 # Add markdown file contents and URL to list
-                markdown_files += [{"content": markdown_file_contents, "path": url_path}]
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
 
         return markdown_files
 
+    def get_file_contents(self, file_url):
+        # Get text from each markdown file
+        headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        response = requests.get(file_url, headers=headers)
+
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
+        if result is not None:
+            return result
+
+        return response.content.decode("utf-8")
+
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
         headers = {"Authorization": f"{self.config.pat_token}"}

From 87975e589aa562521e24cf55588e756bb2623b6b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:12:47 -0700
Subject: [PATCH 08/10] Fix passing auth token to Github API to increase rate
 limits by ~83x

- Previously the "token" prefix was missing from the PAT in the
  Authorization header, so the requests were treated as unauthenticated

- Unauthenticated requests to the Github API are limited to 60 requests/hour
  Authenticated requests to the Github API are allowed 5000 requests/hour
---
 src/khoj/processor/github/github_to_jsonl.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f29bef2b..d76f4979 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -77,12 +77,9 @@ class GithubToJsonl(TextToJsonl):
         return entries_with_ids
 
     def get_markdown_files(self):
-        # set the url to get the contents of the repository
+        # Get the contents of the repository
         repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
-        # set the headers to include the authentication token
-        headers = {"Authorization": f"{self.config.pat_token}"}
-
-        # get the contents of the repository
+        headers = {"Authorization": f"token {self.config.pat_token}"}
         response = requests.get(repo_content_url, headers=headers)
         contents = response.json()
 
@@ -91,6 +88,7 @@ class GithubToJsonl(TextToJsonl):
         if result is not None:
             return result
 
+        # Extract markdown files from the repository
         markdown_files = []
         for item in contents["tree"]:
             # Find all markdown files in the repository
@@ -105,7 +103,7 @@ class GithubToJsonl(TextToJsonl):
 
     def get_file_contents(self, file_url):
         # Get text from each markdown file
-        headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
         response = requests.get(file_url, headers=headers)
 
         # Wait for rate limit reset if needed
@@ -117,8 +115,8 @@ class GithubToJsonl(TextToJsonl):
 
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
-        headers = {"Authorization": f"{self.config.pat_token}"}
         commits_url = f"{self.repo_url}/commits"
+        headers = {"Authorization": f"token {self.config.pat_token}"}
         commits = []
 
         while commits_url is not None:

From 6fdac2441652d1fb6bcb0cfda413a32ced7ba24f Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:20:05 -0700
Subject: [PATCH 09/10] Set page size to 100 to reduce requests required to
 Github API to 1/3

- Default is 30. So the number of paginated requests required to get all
  items (commits, files) will reduce by about 70%

- No need to increase page size for the get tree Github API request from
  `get_markdown_files`

  The get tree Github API doesn't support pagination and returns up to
  100K items in a single response. This should be way more than enough
  for our current use-cases
---
 src/khoj/processor/github/github_to_jsonl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index d76f4979..789d8259 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -117,11 +117,12 @@ class GithubToJsonl(TextToJsonl):
         # Get commit messages from the repository using the Github API
         commits_url = f"{self.repo_url}/commits"
         headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"per_page": 100}
         commits = []
 
         while commits_url is not None:
             # Get the next page of commits
-            response = requests.get(commits_url, headers=headers)
+            response = requests.get(commits_url, headers=headers, params=params)
             raw_commits = response.json()
 
             # Wait for rate limit reset if needed

From e31a540a5efcb9235629fd63167054e8dd9c95c7 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sun, 18 Jun 2023 01:47:15 -0700
Subject: [PATCH 10/10] Get all md files recursively in repository by passing
 recursive param

Previously the `get_markdown_files` method was only getting files at
the root of the repository

Fix and improve logger messages in the github to jsonl processor
---
 src/khoj/processor/github/github_to_jsonl.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 789d8259..80d55f38 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -38,10 +38,10 @@ class GithubToJsonl(TextToJsonl):
             try:
                 docs = self.get_markdown_files()
             except Exception as e:
-                logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}")
+                logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}")
                 raise e
 
-        logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}")
+        logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}")
 
         with timer("Extract markdown entries from github repo", logger):
             current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
@@ -63,7 +63,7 @@ class GithubToJsonl(TextToJsonl):
                     current_entries, previous_entries, key="compiled", logger=logger
                 )
 
-        with timer("Write markdown entries to JSONL file", logger):
+        with timer("Write github entries to JSONL file", logger):
             # Process Each Entry from All Notes Files
             entries = list(map(lambda entry: entry[1], entries_with_ids))
             jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
@@ -80,7 +80,8 @@ class GithubToJsonl(TextToJsonl):
         # Get the contents of the repository
         repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
         headers = {"Authorization": f"token {self.config.pat_token}"}
-        response = requests.get(repo_content_url, headers=headers)
+        params = {"recursive": "true"}
+        response = requests.get(repo_content_url, headers=headers, params=params)
         contents = response.json()
 
         # Wait for rate limit reset if needed