From c29c141a7e979f843090a19f1c79f85f7e586433 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 17 Jun 2023 01:39:57 -0700 Subject: [PATCH 01/10] Use Github Rest API to index Markdown files in Github Repository The Llama_Hub Github plugin is fairly limited. The Github Rest API is well supported and can easily be extended to index commit messages, issues, discussions, PRs etc. --- pyproject.toml | 2 +- src/khoj/processor/github/github_to_jsonl.py | 57 +++++++++++--------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index db152d29..f44849ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ dependencies = [ "aiohttp == 3.8.4", "langchain >= 0.0.187", "pypdf >= 3.9.0", - "llama-hub==0.0.3", + "requests >= 2.26.0", ] dynamic = ["version"] diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index b989c12f..fa8eb8a1 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -1,12 +1,16 @@ +# Standard Packages import logging -from llama_index import download_loader + +# External Packages +import requests + +# Internal Packages from khoj.utils.helpers import timer from khoj.utils.rawconfig import GithubContentConfig -from llama_hub.github_repo import GithubRepositoryReader, GithubClient from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data -from khoj.utils import state + logger = logging.getLogger(__name__) @@ -14,18 +18,11 @@ logger = logging.getLogger(__name__) class GithubToJsonl(TextToJsonl): def __init__(self, config: GithubContentConfig): super().__init__(config) - download_loader("GithubRepositoryReader") + self.config = config + self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}" def process(self, 
previous_entries=None): - try: - self.initialize() - except Exception as e: - logger.error( - f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}" - ) - raise e - - with timer("Download github repo", logger): + with timer("Download markdown files from github repo", logger): try: docs = self.get_markdown_files() except Exception as e: @@ -64,19 +61,27 @@ class GithubToJsonl(TextToJsonl): return entries_with_ids - def initialize(self): - logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}") - github_client = GithubClient(self.config.pat_token) - self.loader = GithubRepositoryReader( - github_client, - owner=self.config.repo_owner, - repo=self.config.repo_name, - filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE), - verbose=state.verbose > 1, - ) - def get_markdown_files(self): - return self.loader.load_data(branch=self.config.repo_branch) + # set the url to get the contents of the repository + repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}" + # set the headers to include the authentication token + headers = {"Authorization": f"{self.config.pat_token}"} + + # get the contents of the repository + response = requests.get(repo_content_url, headers=headers) + contents = response.json() + + markdown_files = [] + for item in contents["tree"]: + # Find all markdown files in the repository + if item["type"] == "blob" and item["path"].endswith(".md"): + # Get text from each markdown file + file_content_url = f'{self.repo_url}/contents/{item["path"]}' + headers["Accept"] = "application/vnd.github.v3.raw" + markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8") + markdown_files += [{"content": markdown_file_contents, "path": item["path"]}] + + return markdown_files @staticmethod def extract_markdown_entries(markdown_files): @@ -84,6 +89,6 @@ class GithubToJsonl(TextToJsonl): 
entry_to_file_map = [] for doc in markdown_files: entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( - doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map + doc["content"], doc["path"], entries, entry_to_file_map ) return entries, dict(entry_to_file_map) From 31d17d0b22ce7cbc2337ee48dedd885c757f715e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 17 Jun 2023 02:50:58 -0700 Subject: [PATCH 02/10] Index commit messages from repository with the github plugin --- src/khoj/processor/github/github_to_jsonl.py | 34 +++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index fa8eb8a1..bcadd09b 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -1,12 +1,13 @@ # Standard Packages import logging +from typing import Dict, List # External Packages import requests # Internal Packages from khoj.utils.helpers import timer -from khoj.utils.rawconfig import GithubContentConfig +from khoj.utils.rawconfig import Entry, GithubContentConfig from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data @@ -36,6 +37,9 @@ class GithubToJsonl(TextToJsonl): *GithubToJsonl.extract_markdown_entries(docs) ) + with timer("Extract commit messages from github repo", logger): + current_entries += self.convert_commits_to_entries(self.get_commits()) + with timer("Split entries by max token size supported by model", logger): current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256) @@ -83,6 +87,34 @@ class GithubToJsonl(TextToJsonl): return markdown_files + def get_commits(self) -> List[Dict]: + # Get commit messages from the repository using the Github API + headers = {"Authorization": 
f"{self.config.pat_token}"} + response = requests.get(f"{self.repo_url}/commits", headers=headers) + raw_commits = response.json() + + # Extract commit messages from the response + commits = [] + for commit in raw_commits: + commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] + + return commits + + def convert_commits_to_entries(self, commits) -> List[Entry]: + entries: List[Entry] = [] + for commit in commits: + compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}' + entries.append( + Entry( + compiled=compiled, + raw=f'### {commit["content"]}', + heading=commit["content"].split("\n")[0], + file=commit["path"], + ) + ) + + return entries + @staticmethod def extract_markdown_entries(markdown_files): entries = [] From 0c1c7583b58ec299bbc40b5f34d512bcacedf812 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 17 Jun 2023 03:38:16 -0700 Subject: [PATCH 03/10] Handle pagination, API rate limits. Get all commits from Github repo --- src/khoj/processor/github/github_to_jsonl.py | 35 ++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index bcadd09b..f862c951 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -1,5 +1,6 @@ # Standard Packages import logging +import time from typing import Dict, List # External Packages @@ -75,6 +76,13 @@ class GithubToJsonl(TextToJsonl): response = requests.get(repo_content_url, headers=headers) contents = response.json() + # If the rate limit is reached, wait for the reset time + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) + logger.info(f"Github Rate limit reached. 
Waiting for {wait_time} seconds") + time.sleep(wait_time) + return self.get_markdown_files() + markdown_files = [] for item in contents["tree"]: # Find all markdown files in the repository @@ -90,13 +98,28 @@ class GithubToJsonl(TextToJsonl): def get_commits(self) -> List[Dict]: # Get commit messages from the repository using the Github API headers = {"Authorization": f"{self.config.pat_token}"} - response = requests.get(f"{self.repo_url}/commits", headers=headers) - raw_commits = response.json() - - # Extract commit messages from the response + commits_url = f"{self.repo_url}/commits" commits = [] - for commit in raw_commits: - commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] + + while commits_url is not None: + # Get the next page of commits + response = requests.get(commits_url, headers=headers) + + # If the rate limit is reached, wait for the reset time + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) + logger.info(f"Github Rate limit reached. 
Waiting for {wait_time} seconds") + time.sleep(wait_time) + continue + + raw_commits = response.json() + + # Extract commit messages from the response + for commit in raw_commits: + commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] + + # Get the URL for the next page of commits, if any + commits_url = response.links.get("next", {}).get("url") return commits From 63ec84ad782cad2287f641895bcc16622444d8a5 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 17 Jun 2023 04:23:01 -0700 Subject: [PATCH 04/10] Store Github URL of Markdown files on Github in file jsonl param --- src/khoj/processor/github/github_to_jsonl.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index f862c951..d21f688b 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -87,11 +87,16 @@ class GithubToJsonl(TextToJsonl): for item in contents["tree"]: # Find all markdown files in the repository if item["type"] == "blob" and item["path"].endswith(".md"): + # Create URL for each markdown file on Github + url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}' + # Get text from each markdown file file_content_url = f'{self.repo_url}/contents/{item["path"]}' headers["Accept"] = "application/vnd.github.v3.raw" markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8") - markdown_files += [{"content": markdown_file_contents, "path": item["path"]}] + + # Add markdown file contents and URL to list + markdown_files += [{"content": markdown_file_contents, "path": url_path}] return markdown_files From 3f24e53b6e2fd60f88e7988bea3ee5a080cfe206 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 17 Jun 2023 04:26:40 -0700 Subject: [PATCH 05/10] Render URL as link in web 
interface if file param of result is a web link --- src/khoj/interface/web/index.html | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 388c0207..d9874072 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -34,6 +34,9 @@ function render_markdown(query, data) { var md = window.markdownit(); return md.render(data.map(function (item) { + lines = item.entry.split("\n") + if (item.additional.file.startsWith("http")) + return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}` return `${item.entry}` }).join("\n")); } From 10d4c38ce9e9bcb2dc72aea03f15fbfe91b877be Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:06:46 -0700 Subject: [PATCH 06/10] Extract Wait for rate limit reset logic into a function for reuse --- src/khoj/processor/github/github_to_jsonl.py | 33 +++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index d21f688b..f622b5e9 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -23,6 +23,16 @@ class GithubToJsonl(TextToJsonl): self.config = config self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}" + @staticmethod + def wait_for_rate_limit_reset(response, func, *args, **kwargs): + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) + logger.info(f"Github Rate limit reached. 
Waiting for {wait_time} seconds") + time.sleep(wait_time) + return func(*args, **kwargs) + else: + return + def process(self, previous_entries=None): with timer("Download markdown files from github repo", logger): try: @@ -76,12 +86,10 @@ class GithubToJsonl(TextToJsonl): response = requests.get(repo_content_url, headers=headers) contents = response.json() - # If the rate limit is reached, wait for the reset time - if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": - wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) - logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds") - time.sleep(wait_time) - return self.get_markdown_files() + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_markdown_files) + if result is not None: + return result markdown_files = [] for item in contents["tree"]: @@ -109,16 +117,13 @@ class GithubToJsonl(TextToJsonl): while commits_url is not None: # Get the next page of commits response = requests.get(commits_url, headers=headers) - - # If the rate limit is reached, wait for the reset time - if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": - wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) - logger.info(f"Github Rate limit reached. 
Waiting for {wait_time} seconds") - time.sleep(wait_time) - continue - raw_commits = response.json() + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_commits) + if result is not None: + return result + # Extract commit messages from the response for commit in raw_commits: commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] From 9c70af960ca2d1c5dd357ef4a1dcf085833d1a2c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:08:57 -0700 Subject: [PATCH 07/10] Extract logic to get file content from Github into a separate method --- src/khoj/processor/github/github_to_jsonl.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index f622b5e9..f29bef2b 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -98,16 +98,23 @@ class GithubToJsonl(TextToJsonl): # Create URL for each markdown file on Github url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}' - # Get text from each markdown file - file_content_url = f'{self.repo_url}/contents/{item["path"]}' - headers["Accept"] = "application/vnd.github.v3.raw" - markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8") - # Add markdown file contents and URL to list - markdown_files += [{"content": markdown_file_contents, "path": url_path}] + markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}] return markdown_files + def get_file_contents(self, file_url): + # Get text from each markdown file + headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"} + response = requests.get(file_url, headers=headers) + + # Wait for rate limit reset if needed + result = 
self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url) + if result is not None: + return result + + return response.content.decode("utf-8") + def get_commits(self) -> List[Dict]: # Get commit messages from the repository using the Github API headers = {"Authorization": f"{self.config.pat_token}"} From 87975e589aa562521e24cf55588e756bb2623b6b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:12:47 -0700 Subject: [PATCH 08/10] Fix passing auth token to Github API to increase rate limits by x85 - Previously wasn't prefixing "token" to PAT token in Auth header This resulted in the request being considered unauthenticated - Unauthenticated requests to Github API are limited to 60 requests/hour Authenticated requests to Github API are allowed 5000 requests/hour --- src/khoj/processor/github/github_to_jsonl.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index f29bef2b..d76f4979 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -77,12 +77,9 @@ class GithubToJsonl(TextToJsonl): return entries_with_ids def get_markdown_files(self): - # set the url to get the contents of the repository + # Get the contents of the repository repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}" - # set the headers to include the authentication token - headers = {"Authorization": f"{self.config.pat_token}"} - - # get the contents of the repository + headers = {"Authorization": f"token {self.config.pat_token}"} response = requests.get(repo_content_url, headers=headers) contents = response.json() @@ -91,6 +88,7 @@ class GithubToJsonl(TextToJsonl): if result is not None: return result + # Extract markdown files from the repository markdown_files = [] for item in contents["tree"]: # Find all markdown files in the repository @@ -105,7 +103,7 @@ 
class GithubToJsonl(TextToJsonl): def get_file_contents(self, file_url): # Get text from each markdown file - headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"} + headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"} response = requests.get(file_url, headers=headers) # Wait for rate limit reset if needed @@ -117,8 +115,8 @@ class GithubToJsonl(TextToJsonl): def get_commits(self) -> List[Dict]: # Get commit messages from the repository using the Github API - headers = {"Authorization": f"{self.config.pat_token}"} commits_url = f"{self.repo_url}/commits" + headers = {"Authorization": f"token {self.config.pat_token}"} commits = [] while commits_url is not None: From 6fdac2441652d1fb6bcb0cfda413a32ced7ba24f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:20:05 -0700 Subject: [PATCH 09/10] Set page size to 100 to reduce requests required to Github API to 1/3 - Default is 30. So number of paginated requests required to get all items (commits, files) will reduce by 67% - No need to increase page size for the get tree Github API request from `get_markdown_files' Get tree Github API doesn't support pagination and returns 100K items in response. 
This should be way more than enough for our current use-cases --- src/khoj/processor/github/github_to_jsonl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index d76f4979..789d8259 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -117,11 +117,12 @@ class GithubToJsonl(TextToJsonl): # Get commit messages from the repository using the Github API commits_url = f"{self.repo_url}/commits" headers = {"Authorization": f"token {self.config.pat_token}"} + params = {"per_page": 100} commits = [] while commits_url is not None: # Get the next page of commits - response = requests.get(commits_url, headers=headers) + response = requests.get(commits_url, headers=headers, params=params) raw_commits = response.json() # Wait for rate limit reset if needed From e31a540a5efcb9235629fd63167054e8dd9c95c7 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:47:15 -0700 Subject: [PATCH 10/10] Get all md files recursively in repository by passing recursive param Previously the `get_markdown_files' method was only getting files at root of the repository Fix, improve logger messages in github to jsonl processor --- src/khoj/processor/github/github_to_jsonl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 789d8259..80d55f38 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -38,10 +38,10 @@ class GithubToJsonl(TextToJsonl): try: docs = self.get_markdown_files() except Exception as e: - logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}") + logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}") raise e - 
logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}") + logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}") with timer("Extract markdown entries from github repo", logger): current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( @@ -63,7 +63,7 @@ class GithubToJsonl(TextToJsonl): current_entries, previous_entries, key="compiled", logger=logger ) - with timer("Write markdown entries to JSONL file", logger): + with timer("Write github entries to JSONL file", logger): # Process Each Entry from All Notes Files entries = list(map(lambda entry: entry[1], entries_with_ids)) jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) @@ -80,7 +80,8 @@ class GithubToJsonl(TextToJsonl): # Get the contents of the repository repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}" headers = {"Authorization": f"token {self.config.pat_token}"} - response = requests.get(repo_content_url, headers=headers) + params = {"recursive": "true"} + response = requests.get(repo_content_url, headers=headers, params=params) contents = response.json() # Wait for rate limit reset if needed