From d5c9b5cb32a0fe4186c360dad4bb0378cf359e13 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 8 Apr 2024 23:05:13 +0530 Subject: [PATCH] Stop indexing commits, issues and issue comments in Github indexer Normal indexing quickly hits Github rate limits. Purpose of exposing Github indexer is for indexing content like notes, code and other knowledge base in a repo. The current indexer doesn't scale to index metadata given Github's rate limits, so remove it instead of giving a degraded experience of partially indexed repos --- .../content/github/github_to_entries.py | 156 +----------------- 1 file changed, 1 insertion(+), 155 deletions(-) diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py index a3b4cdbc..53b1b98b 100644 --- a/src/khoj/processor/content/github/github_to_entries.py +++ b/src/khoj/processor/content/github/github_to_entries.py @@ -1,7 +1,6 @@ import logging import time -from datetime import datetime -from typing import Any, Dict, List, Tuple, Union +from typing import Any, List, Tuple import requests @@ -83,15 +82,6 @@ class GithubToEntries(TextToEntries): *GithubToEntries.extract_org_entries(org_files) ) - with timer(f"Extract commit messages from github repo {repo_shorthand}", logger): - current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo) - - with timer(f"Extract issues from github repo {repo_shorthand}", logger): - issue_entries = GithubToEntries.convert_issues_to_entries( - *GithubToEntries.extract_github_issues(self.get_issues(repo_url)) - ) - current_entries += issue_entries - with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger): current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256) @@ -170,121 +160,6 @@ class GithubToEntries(TextToEntries): return content - def get_commits(self, repo_url: str) -> List[Dict]: - return 
self._get_commits(f"{repo_url}/commits") - - def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]: - # Get commit messages from the repository using the Github API - params = {"per_page": 100} - commits = [] - - while commits_url is not None: - # Get the next page of commits - response = self.session.get(commits_url, params=params, stream=True) - - # Read the streamed response into a JSON object - content = response.json() - - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url) - if result is not None: - return result - - # Extract commit messages from the response - for commit in content: - if "commit" in commit and "message" in commit["commit"] and "html_url" in commit: - commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] - else: - logger.debug(f"Skipping commit with missing properties: {commit}") - - # Get the URL for the next page of commits, if any - commits_url = response.links.get("next", {}).get("url") - - return commits - - def get_issues(self, repo_url: str) -> List[Dict]: - return self._get_issues(f"{repo_url}/issues") - - def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]: - issues = [] - per_page = 100 - params = {"per_page": per_page, "state": "all"} - - while issues_url is not None: - # Get the next page of issues - response = self.session.get(issues_url, params=params, stream=True) # type: ignore - raw_issues = response.json() - - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url) - if result is not None: - return result - - for issue in raw_issues: - username = issue["user"]["login"] - user_url = f"[{username}]({issue['user']['html_url']})" - issue_content = { - "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}", - "path": issue["html_url"], - } - issue_content["created_at"] = 
{issue["created_at"]} - if issue["comments"] > 0: - issue_content["comments"] = self.get_comments(issue["comments_url"]) - issues += [issue_content] - - issues_url = response.links.get("next", {}).get("url") - - return issues - - def get_comments(self, comments_url: Union[str, None]) -> List[Dict]: - # By default, the number of results per page is 30. We'll keep it as-is for now. - comments = [] - per_page = 100 - params = {"per_page": per_page} - - while comments_url is not None: - # Get the next page of comments - response = self.session.get(comments_url, params=params, stream=True) - raw_comments = response.json() - - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url) - if result is not None: - return result - - for comment in raw_comments: - created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M") - commenter = comment["user"]["login"] - commenter_url = comment["user"]["html_url"] - comment_url = comment["html_url"] - comment_url_link = f"[{created_at}]({comment_url})" - avatar_url = comment["user"]["avatar_url"] - avatar = f"![{commenter}]({avatar_url})" - comments += [ - { - "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}" - } - ] - - comments_url = response.links.get("next", {}).get("url") - - return comments - - def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]: - entries: List[Entry] = [] - for commit in commits: - compiled = f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}' - entries.append( - Entry( - compiled=compiled, - raw=f'### {commit["content"]}', - heading=commit["content"].split("\n")[0], - file=commit["path"], - ) - ) - - return entries - @staticmethod def extract_markdown_entries(markdown_files): entries = [] @@ -305,32 +180,3 @@ class GithubToEntries(TextToEntries): doc["content"], doc["path"], entries, entry_to_file_map 
) return entries, dict(entry_to_file_map) - - @staticmethod - def extract_github_issues(issues): - entries = [] - entry_to_file_map = {} - for issue in issues: - content = issue["content"] - if "comments" in issue: - for comment in issue["comments"]: - content += "\n\n" + comment["content"] - entries.append(content) - entry_to_file_map[content] = {"path": issue["path"]} - return entries, entry_to_file_map - - @staticmethod - def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]: - entries = [] - for entry in parsed_entries: - entry_file_name = entry_to_metadata_map[entry]["path"] - entries.append( - Entry( - compiled=entry, - raw=entry, - heading=entry.split("\n")[0], - file=entry_file_name, - ) - ) - - return entries