diff --git a/pyproject.toml b/pyproject.toml index db152d29..f44849ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ dependencies = [ "aiohttp == 3.8.4", "langchain >= 0.0.187", "pypdf >= 3.9.0", - "llama-hub==0.0.3", + "requests >= 2.26.0", ] dynamic = ["version"] diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 388c0207..d9874072 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -34,6 +34,9 @@ function render_markdown(query, data) { var md = window.markdownit(); return md.render(data.map(function (item) { + lines = item.entry.split("\n") + if (item.additional.file.startsWith("http")) + return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}` return `${item.entry}` }).join("\n")); } diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index b989c12f..80d55f38 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -1,12 +1,18 @@ +# Standard Packages import logging -from llama_index import download_loader +import time +from typing import Dict, List + +# External Packages +import requests + +# Internal Packages from khoj.utils.helpers import timer -from khoj.utils.rawconfig import GithubContentConfig -from llama_hub.github_repo import GithubRepositoryReader, GithubClient +from khoj.utils.rawconfig import Entry, GithubContentConfig from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data -from khoj.utils import state + logger = logging.getLogger(__name__) @@ -14,31 +20,37 @@ logger = logging.getLogger(__name__) class GithubToJsonl(TextToJsonl): def __init__(self, config: GithubContentConfig): super().__init__(config) - download_loader("GithubRepositoryReader") + self.config = config + self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}" + + @staticmethod + def wait_for_rate_limit_reset(response, func, *args, **kwargs): + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) + logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds") + time.sleep(wait_time) + return func(*args, **kwargs) + else: + return def process(self, previous_entries=None): - try: - self.initialize() - except Exception as e: - logger.error( - f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}" - ) - raise e - - with timer("Download github repo", logger): + with timer("Download markdown files from github repo", logger): try: docs = self.get_markdown_files() except Exception as e: - logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}") + logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}") raise e - logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}") + logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}") with timer("Extract markdown entries from github repo", logger): current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( *GithubToJsonl.extract_markdown_entries(docs) ) + with timer("Extract commit messages from github repo", logger): + current_entries += self.convert_commits_to_entries(self.get_commits()) + with timer("Split entries by max token size supported by model", logger): current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256) @@ -51,7 +63,7 @@ class GithubToJsonl(TextToJsonl): current_entries, previous_entries, key="compiled", logger=logger ) - with timer("Write markdown entries to JSONL file", logger): + with timer("Write github entries to JSONL file", logger): # Process Each Entry from All Notes Files entries = list(map(lambda entry: entry[1], entries_with_ids)) jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) @@ -64,19 +76,84 @@ class GithubToJsonl(TextToJsonl): return entries_with_ids - def initialize(self): - logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}") - github_client = GithubClient(self.config.pat_token) - self.loader = GithubRepositoryReader( - github_client, - owner=self.config.repo_owner, - repo=self.config.repo_name, - filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE), - verbose=state.verbose > 1, - ) - def get_markdown_files(self): - return self.loader.load_data(branch=self.config.repo_branch) + # Get the contents of the repository + repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}" + headers = {"Authorization": f"token {self.config.pat_token}"} + params = {"recursive": "true"} + response = requests.get(repo_content_url, headers=headers, params=params) + contents = response.json() + + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_markdown_files) + if result is not None: + return result + + # Extract markdown files from the repository + markdown_files = [] + for item in contents["tree"]: + # Find all markdown files in the repository + if item["type"] == "blob" and item["path"].endswith(".md"): + # Create URL for each markdown file on Github + url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}' + + # Add markdown file contents and URL to list + markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}] + + return markdown_files + + def get_file_contents(self, file_url): + # Get text from each markdown file + headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"} + response = requests.get(file_url, headers=headers) + + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url) + if result is not None: + return result + + return response.content.decode("utf-8") + + def get_commits(self) -> List[Dict]: + # Get commit messages from the repository using the Github API + commits_url = f"{self.repo_url}/commits" + headers = {"Authorization": f"token {self.config.pat_token}"} + params = {"per_page": 100} + commits = [] + + while commits_url is not None: + # Get the next page of commits + response = requests.get(commits_url, headers=headers, params=params) + raw_commits = response.json() + + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_commits) + if result is not None: + return result + + # Extract commit messages from the response + for commit in raw_commits: + commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] + + # Get the URL for the next page of commits, if any + commits_url = response.links.get("next", {}).get("url") + + return commits + + def convert_commits_to_entries(self, commits) -> List[Entry]: + entries: List[Entry] = [] + for commit in commits: + compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}' + entries.append( + Entry( + compiled=compiled, + raw=f'### {commit["content"]}', + heading=commit["content"].split("\n")[0], + file=commit["path"], + ) + ) + + return entries @staticmethod def extract_markdown_entries(markdown_files): @@ -84,6 +161,6 @@ class GithubToJsonl(TextToJsonl): entry_to_file_map = [] for doc in markdown_files: entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( - doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map + doc["content"], doc["path"], entries, entry_to_file_map ) return entries, dict(entry_to_file_map)