Use GitHub REST API to index Markdown files in a GitHub repository

The Llama_Hub GitHub plugin is fairly limited.

The GitHub REST API is well supported and can easily be extended to
index commit messages, issues, discussions, PRs, etc.
This commit is contained in:
Debanjum Singh Solanky
2023-06-17 01:39:57 -07:00
parent 9f00a366ab
commit c29c141a7e
2 changed files with 32 additions and 27 deletions

View File

@@ -56,7 +56,7 @@ dependencies = [
"aiohttp == 3.8.4", "aiohttp == 3.8.4",
"langchain >= 0.0.187", "langchain >= 0.0.187",
"pypdf >= 3.9.0", "pypdf >= 3.9.0",
"llama-hub==0.0.3", "requests >= 2.26.0",
] ]
dynamic = ["version"] dynamic = ["version"]

View File

@@ -1,12 +1,16 @@
# Standard Packages
import logging import logging
from llama_index import download_loader
# External Packages
import requests
# Internal Packages
from khoj.utils.helpers import timer from khoj.utils.helpers import timer
from khoj.utils.rawconfig import GithubContentConfig from khoj.utils.rawconfig import GithubContentConfig
from llama_hub.github_repo import GithubRepositoryReader, GithubClient
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.text_to_jsonl import TextToJsonl from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils import state
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -14,18 +18,11 @@ logger = logging.getLogger(__name__)
class GithubToJsonl(TextToJsonl): class GithubToJsonl(TextToJsonl):
def __init__(self, config: GithubContentConfig): def __init__(self, config: GithubContentConfig):
super().__init__(config) super().__init__(config)
download_loader("GithubRepositoryReader") self.config = config
self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
def process(self, previous_entries=None): def process(self, previous_entries=None):
try: with timer("Download markdown files from github repo", logger):
self.initialize()
except Exception as e:
logger.error(
f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
)
raise e
with timer("Download github repo", logger):
try: try:
docs = self.get_markdown_files() docs = self.get_markdown_files()
except Exception as e: except Exception as e:
@@ -64,19 +61,27 @@ class GithubToJsonl(TextToJsonl):
return entries_with_ids return entries_with_ids
def initialize(self):
logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
github_client = GithubClient(self.config.pat_token)
self.loader = GithubRepositoryReader(
github_client,
owner=self.config.repo_owner,
repo=self.config.repo_name,
filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
verbose=state.verbose > 1,
)
def get_markdown_files(self): def get_markdown_files(self):
return self.loader.load_data(branch=self.config.repo_branch) # set the url to get the contents of the repository
repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
# set the headers to include the authentication token
headers = {"Authorization": f"{self.config.pat_token}"}
# get the contents of the repository
response = requests.get(repo_content_url, headers=headers)
contents = response.json()
markdown_files = []
for item in contents["tree"]:
# Find all markdown files in the repository
if item["type"] == "blob" and item["path"].endswith(".md"):
# Get text from each markdown file
file_content_url = f'{self.repo_url}/contents/{item["path"]}'
headers["Accept"] = "application/vnd.github.v3.raw"
markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
return markdown_files
@staticmethod @staticmethod
def extract_markdown_entries(markdown_files): def extract_markdown_entries(markdown_files):
@@ -84,6 +89,6 @@ class GithubToJsonl(TextToJsonl):
entry_to_file_map = [] entry_to_file_map = []
for doc in markdown_files: for doc in markdown_files:
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map doc["content"], doc["path"], entries, entry_to_file_map
) )
return entries, dict(entry_to_file_map) return entries, dict(entry_to_file_map)