mirror of
https://github.com/khoaliber/khoj.git
synced 2026-03-05 13:21:18 +00:00
Stop indexing commits, issues and issue comments in Github indexer
Normal indexing quickly hits Github's rate limits. The purpose of exposing the Github indexer is to index content like notes, code, and other knowledge-base material in a repo. The current indexer doesn't scale to indexing metadata given Github's rate limits, so remove that capability instead of giving a degraded experience of partially indexed repos.
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Tuple, Union
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
@@ -83,15 +82,6 @@ class GithubToEntries(TextToEntries):
|
||||
*GithubToEntries.extract_org_entries(org_files)
|
||||
)
|
||||
|
||||
with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
|
||||
current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
|
||||
|
||||
with timer(f"Extract issues from github repo {repo_shorthand}", logger):
|
||||
issue_entries = GithubToEntries.convert_issues_to_entries(
|
||||
*GithubToEntries.extract_github_issues(self.get_issues(repo_url))
|
||||
)
|
||||
current_entries += issue_entries
|
||||
|
||||
with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
|
||||
current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
|
||||
|
||||
@@ -170,121 +160,6 @@ class GithubToEntries(TextToEntries):
|
||||
|
||||
return content
|
||||
|
||||
def get_commits(self, repo_url: str) -> List[Dict]:
    """Fetch every commit for the repository via the paginated Github API."""
    commits_endpoint = f"{repo_url}/commits"
    return self._get_commits(commits_endpoint)
||||
def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]:
    """Collect commit messages from the Github API, following pagination.

    Args:
        commits_url: Commits endpoint URL for the page to start from;
            ``None`` short-circuits to an empty list.

    Returns:
        List of dicts with "content" (the commit message) and "path"
        (the commit's html url).
    """
    # Get commit messages from the repository using the Github API
    params = {"per_page": 100}
    commits = []

    while commits_url is not None:
        # Get the next page of commits
        response = self.session.get(commits_url, params=params, stream=True)

        # Read the streamed response into a JSON object
        content = response.json()

        # Wait for rate limit reset if needed. The helper re-fetches the
        # current page (and everything after it) once the limit resets, so
        # prepend the commits already accumulated from earlier pages instead
        # of discarding them (previously `return result` dropped them).
        result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url)
        if result is not None:
            return commits + result

        # Extract commit messages from the response
        for commit in content:
            if "commit" in commit and "message" in commit["commit"] and "html_url" in commit:
                commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
            else:
                logger.debug(f"Skipping commit with missing properties: {commit}")

        # Get the URL for the next page of commits, if any
        commits_url = response.links.get("next", {}).get("url")

    return commits
||||
def get_issues(self, repo_url: str) -> List[Dict]:
    """Fetch every issue (open and closed) for the repository via the Github API."""
    issues_endpoint = f"{repo_url}/issues"
    return self._get_issues(issues_endpoint)
||||
def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]:
    """Collect issues (with their comments) from the Github API, following pagination.

    Args:
        issues_url: Issues endpoint URL for the page to start from;
            ``None`` short-circuits to an empty list.

    Returns:
        List of dicts with "content" (markdown-rendered issue), "path"
        (issue html url), "created_at", and optionally "comments".
    """
    issues = []
    per_page = 100
    params = {"per_page": per_page, "state": "all"}

    while issues_url is not None:
        # Get the next page of issues
        response = self.session.get(issues_url, params=params, stream=True)  # type: ignore
        raw_issues = response.json()

        # Wait for rate limit reset if needed. Prepend issues accumulated
        # from earlier pages so a mid-pagination retry does not drop them.
        result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url)
        if result is not None:
            return issues + result

        for issue in raw_issues:
            username = issue["user"]["login"]
            user_url = f"[{username}]({issue['user']['html_url']})"
            issue_content = {
                "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}",
                "path": issue["html_url"],
            }
            # Fix: previously wrapped in braces, creating a one-element set
            # instead of storing the timestamp string itself.
            issue_content["created_at"] = issue["created_at"]
            if issue["comments"] > 0:
                issue_content["comments"] = self.get_comments(issue["comments_url"])
            issues += [issue_content]

        issues_url = response.links.get("next", {}).get("url")

    return issues
||||
def get_comments(self, comments_url: Union[str, None]) -> List[Dict]:
    """Collect all comments from a comments endpoint, following pagination.

    Args:
        comments_url: Comments endpoint URL for the page to start from;
            ``None`` short-circuits to an empty list.

    Returns:
        List of dicts with "content": a markdown-rendered comment
        (avatar, commenter link, timestamp link, body).
    """
    comments = []
    per_page = 100
    params = {"per_page": per_page}

    while comments_url is not None:
        # Get the next page of comments
        response = self.session.get(comments_url, params=params, stream=True)
        raw_comments = response.json()

        # Wait for rate limit reset if needed. Prepend comments accumulated
        # from earlier pages so a mid-pagination retry does not drop them.
        result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url)
        if result is not None:
            return comments + result

        for comment in raw_comments:
            created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M")
            commenter = comment["user"]["login"]
            commenter_url = comment["user"]["html_url"]
            comment_url = comment["html_url"]
            comment_url_link = f"[{created_at}]({comment_url})"
            avatar_url = comment["user"]["avatar_url"]
            # Render the commenter's avatar as a markdown image. Previously an
            # empty f-string, which left avatar_url computed but unused.
            avatar = f"![{commenter}]({avatar_url})"
            comments += [
                {
                    "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}"
                }
            ]

        comments_url = response.links.get("next", {}).get("url")

    return comments
||||
def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
    """Map raw commit dicts ("content", "path") to Entry objects for indexing."""
    # The repo identity is constant across commits, so build the prefix once.
    prefix = f"Commit message from {repo.owner}/{repo.name}:\n"
    return [
        Entry(
            compiled=f'{prefix}{commit["content"]}',
            raw=f'### {commit["content"]}',
            heading=commit["content"].split("\n")[0],
            file=commit["path"],
        )
        for commit in commits
    ]
||||
@staticmethod
|
||||
def extract_markdown_entries(markdown_files):
|
||||
entries = []
|
||||
@@ -305,32 +180,3 @@ class GithubToEntries(TextToEntries):
|
||||
doc["content"], doc["path"], entries, entry_to_file_map
|
||||
)
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
||||
@staticmethod
|
||||
def extract_github_issues(issues):
|
||||
entries = []
|
||||
entry_to_file_map = {}
|
||||
for issue in issues:
|
||||
content = issue["content"]
|
||||
if "comments" in issue:
|
||||
for comment in issue["comments"]:
|
||||
content += "\n\n" + comment["content"]
|
||||
entries.append(content)
|
||||
entry_to_file_map[content] = {"path": issue["path"]}
|
||||
return entries, entry_to_file_map
|
||||
|
||||
@staticmethod
def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]:
    """Wrap each parsed issue string into an Entry, resolving its file path from the metadata map."""
    return [
        Entry(
            compiled=text,
            raw=text,
            heading=text.split("\n")[0],
            file=entry_to_metadata_map[text]["path"],
        )
        for text in parsed_entries
    ]
||||
Reference in New Issue
Block a user