diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py
index a3b4cdbc..53b1b98b 100644
--- a/src/khoj/processor/content/github/github_to_entries.py
+++ b/src/khoj/processor/content/github/github_to_entries.py
@@ -1,7 +1,6 @@
 import logging
 import time
-from datetime import datetime
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, List, Tuple
 
 import requests
 
@@ -83,15 +82,6 @@ class GithubToEntries(TextToEntries):
                 *GithubToEntries.extract_org_entries(org_files)
             )
 
-        with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
-            current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
-
-        with timer(f"Extract issues from github repo {repo_shorthand}", logger):
-            issue_entries = GithubToEntries.convert_issues_to_entries(
-                *GithubToEntries.extract_github_issues(self.get_issues(repo_url))
-            )
-            current_entries += issue_entries
-
         with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
             current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
 
@@ -170,121 +160,6 @@ class GithubToEntries(TextToEntries):
 
         return content
 
-    def get_commits(self, repo_url: str) -> List[Dict]:
-        return self._get_commits(f"{repo_url}/commits")
-
-    def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]:
-        # Get commit messages from the repository using the Github API
-        params = {"per_page": 100}
-        commits = []
-
-        while commits_url is not None:
-            # Get the next page of commits
-            response = self.session.get(commits_url, params=params, stream=True)
-
-            # Read the streamed response into a JSON object
-            content = response.json()
-
-            # Wait for rate limit reset if needed
-            result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url)
-            if result is not None:
-                return result
-
-            # Extract commit messages from the response
-            for commit in content:
-                if "commit" in commit and "message" in commit["commit"] and "html_url" in commit:
-                    commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
-                else:
-                    logger.debug(f"Skipping commit with missing properties: {commit}")
-
-            # Get the URL for the next page of commits, if any
-            commits_url = response.links.get("next", {}).get("url")
-
-        return commits
-
-    def get_issues(self, repo_url: str) -> List[Dict]:
-        return self._get_issues(f"{repo_url}/issues")
-
-    def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]:
-        issues = []
-        per_page = 100
-        params = {"per_page": per_page, "state": "all"}
-
-        while issues_url is not None:
-            # Get the next page of issues
-            response = self.session.get(issues_url, params=params, stream=True)  # type: ignore
-            raw_issues = response.json()
-
-            # Wait for rate limit reset if needed
-            result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url)
-            if result is not None:
-                return result
-
-            for issue in raw_issues:
-                username = issue["user"]["login"]
-                user_url = f"[{username}]({issue['user']['html_url']})"
-                issue_content = {
-                    "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}",
-                    "path": issue["html_url"],
-                }
-                issue_content["created_at"] = {issue["created_at"]}
-                if issue["comments"] > 0:
-                    issue_content["comments"] = self.get_comments(issue["comments_url"])
-                issues += [issue_content]
-
-            issues_url = response.links.get("next", {}).get("url")
-
-        return issues
-
-    def get_comments(self, comments_url: Union[str, None]) -> List[Dict]:
-        # By default, the number of results per page is 30. We'll keep it as-is for now.
-        comments = []
-        per_page = 100
-        params = {"per_page": per_page}
-
-        while comments_url is not None:
-            # Get the next page of comments
-            response = self.session.get(comments_url, params=params, stream=True)
-            raw_comments = response.json()
-
-            # Wait for rate limit reset if needed
-            result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url)
-            if result is not None:
-                return result
-
-            for comment in raw_comments:
-                created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M")
-                commenter = comment["user"]["login"]
-                commenter_url = comment["user"]["html_url"]
-                comment_url = comment["html_url"]
-                comment_url_link = f"[{created_at}]({comment_url})"
-                avatar_url = comment["user"]["avatar_url"]
-                avatar = f"![{commenter}]({avatar_url})"
-                comments += [
-                    {
-                        "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}"
-                    }
-                ]
-
-            comments_url = response.links.get("next", {}).get("url")
-
-        return comments
-
-    def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
-        entries: List[Entry] = []
-        for commit in commits:
-            compiled = f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}'
-            entries.append(
-                Entry(
-                    compiled=compiled,
-                    raw=f'### {commit["content"]}',
-                    heading=commit["content"].split("\n")[0],
-                    file=commit["path"],
-                )
-            )
-
-        return entries
-
     @staticmethod
     def extract_markdown_entries(markdown_files):
         entries = []
@@ -305,32 +180,3 @@ class GithubToEntries(TextToEntries):
                 doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)
-
-    @staticmethod
-    def extract_github_issues(issues):
-        entries = []
-        entry_to_file_map = {}
-        for issue in issues:
-            content = issue["content"]
-            if "comments" in issue:
-                for comment in issue["comments"]:
-                    content += "\n\n" + comment["content"]
-            entries.append(content)
-            entry_to_file_map[content] = {"path": issue["path"]}
-        return entries, entry_to_file_map
-
-    @staticmethod
-    def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]:
-        entries = []
-        for entry in parsed_entries:
-            entry_file_name = entry_to_metadata_map[entry]["path"]
-            entries.append(
-                Entry(
-                    compiled=entry,
-                    raw=entry,
-                    heading=entry.split("\n")[0],
-                    file=entry_file_name,
-                )
-            )
-
-        return entries