Stop indexing commits, issues and issue comments in Github indexer

Normal indexing quickly hits Github's rate limits. The purpose of exposing the
Github indexer is to index content like notes, code and other
knowledge-base material in a repo.

The current indexer doesn't scale to index metadata given Github's
rate limits, so remove it instead of giving a degraded experience of
partially indexed repos.
This commit is contained in:
Debanjum Singh Solanky
2024-04-08 23:05:13 +05:30
parent 7ff1bd9f8b
commit d5c9b5cb32

View File

@@ -1,7 +1,6 @@
import logging
import time
from datetime import datetime
from typing import Any, Dict, List, Tuple, Union
from typing import Any, List, Tuple
import requests
@@ -83,15 +82,6 @@ class GithubToEntries(TextToEntries):
*GithubToEntries.extract_org_entries(org_files)
)
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
issue_entries = GithubToEntries.convert_issues_to_entries(
*GithubToEntries.extract_github_issues(self.get_issues(repo_url))
)
current_entries += issue_entries
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
@@ -170,121 +160,6 @@ class GithubToEntries(TextToEntries):
return content
def get_commits(self, repo_url: str) -> List[Dict]:
return self._get_commits(f"{repo_url}/commits")
def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]:
    """Page through the Github commits API, collecting commit messages.

    Each result is a dict with the commit message under "content" and the
    commit's html URL under "path". Follows the response's "next" pagination
    link until exhausted.
    """
    query_params = {"per_page": 100}
    collected: List[Dict] = []
    while commits_url is not None:
        # Fetch the current page of commits
        response = self.session.get(commits_url, params=query_params, stream=True)
        page = response.json()
        # If rate limited, wait for the reset and return the retried result instead
        retried = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url)
        if retried is not None:
            return retried
        for item in page:
            if "commit" in item and "message" in item["commit"] and "html_url" in item:
                collected.append({"content": item["commit"]["message"], "path": item["html_url"]})
            else:
                logger.debug(f"Skipping commit with missing properties: {item}")
        # Advance to the next page of commits, if Github provided one
        commits_url = response.links.get("next", {}).get("url")
    return collected
def get_issues(self, repo_url: str) -> List[Dict]:
    """Fetch all issues (open and closed) for the repository at *repo_url*."""
    issues_endpoint = f"{repo_url}/issues"
    return self._get_issues(issues_endpoint)
def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]:
    """Page through the Github issues API, collecting issues and their comments.

    Returns a list of dicts with the rendered issue markdown under "content",
    the issue's html URL under "path", the raw ISO timestamp string under
    "created_at", and (only when the issue has comments) the fetched comments
    under "comments".
    """
    issues: List[Dict] = []
    per_page = 100
    params = {"per_page": per_page, "state": "all"}
    while issues_url is not None:
        # Get the next page of issues
        response = self.session.get(issues_url, params=params, stream=True)  # type: ignore
        raw_issues = response.json()
        # Wait for rate limit reset if needed; if we retried, return that result
        result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url)
        if result is not None:
            return result
        for issue in raw_issues:
            username = issue["user"]["login"]
            user_url = f"[{username}]({issue['user']['html_url']})"
            # Issues created with no description have body == None; render as empty
            # instead of the literal text "None"
            body = issue["body"] or ""
            issue_content = {
                "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{body}",
                "path": issue["html_url"],
            }
            # Bug fix: the original wrote {issue["created_at"]} — a one-element
            # set literal — instead of the timestamp string itself
            issue_content["created_at"] = issue["created_at"]
            if issue["comments"] > 0:
                issue_content["comments"] = self.get_comments(issue["comments_url"])
            issues += [issue_content]
        issues_url = response.links.get("next", {}).get("url")
    return issues
def get_comments(self, comments_url: Union[str, None]) -> List[Dict]:
    """Page through the Github comments API, rendering each comment as markdown.

    Each result is a dict with a single "content" key holding the rendered
    markdown: avatar, author link, timestamped permalink, then the comment body.
    """
    all_comments: List[Dict] = []
    # Request the maximum page size to minimize the number of API calls
    params = {"per_page": 100}
    while comments_url is not None:
        # Fetch the current page of comments
        response = self.session.get(comments_url, params=params, stream=True)
        page = response.json()
        # If rate limited, wait for the reset and return the retried result instead
        retried = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url)
        if retried is not None:
            return retried
        for item in page:
            timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M")
            author = item["user"]["login"]
            author_profile_url = item["user"]["html_url"]
            permalink = f"[{timestamp}]({item['html_url']})"
            avatar_markdown = f"![{author}]({item['user']['avatar_url']})"
            rendered = f"### {avatar_markdown} [{author}]({author_profile_url}) - ({permalink})\n\n{item['body']}"
            all_comments.append({"content": rendered})
        # Advance to the next page of comments, if Github provided one
        comments_url = response.links.get("next", {}).get("url")
    return all_comments
def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
    """Turn raw commit dicts ("content", "path") into Entry objects for indexing."""
    return [
        Entry(
            compiled=f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}',
            raw=f'### {commit["content"]}',
            heading=commit["content"].split("\n")[0],
            file=commit["path"],
        )
        for commit in commits
    ]
@staticmethod
def extract_markdown_entries(markdown_files):
entries = []
@@ -305,32 +180,3 @@ class GithubToEntries(TextToEntries):
doc["content"], doc["path"], entries, entry_to_file_map
)
return entries, dict(entry_to_file_map)
@staticmethod
def extract_github_issues(issues):
entries = []
entry_to_file_map = {}
for issue in issues:
content = issue["content"]
if "comments" in issue:
for comment in issue["comments"]:
content += "\n\n" + comment["content"]
entries.append(content)
entry_to_file_map[content] = {"path": issue["path"]}
return entries, entry_to_file_map
@staticmethod
def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]:
    """Wrap parsed issue strings in Entry objects, resolving each entry's file path from the metadata map."""
    return [
        Entry(
            compiled=text,
            raw=text,
            heading=text.split("\n")[0],
            file=entry_to_metadata_map[text]["path"],
        )
        for text in parsed_entries
    ]