From d5c9b5cb32a0fe4186c360dad4bb0378cf359e13 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 8 Apr 2024 23:05:13 +0530 Subject: [PATCH] Stop indexing commits, issues and issue comments in Github indexer Normal indexing quickly hits Github rate limits. Purpose of exposing Github indexer is for indexing content like notes, code and other knowledge base in a repo. The current indexer doesn't scale to index metadata given Github's rate limits, so remove it instead of giving a degraded experience of partially indexed repos --- .../content/github/github_to_entries.py | 156 +----------------- 1 file changed, 1 insertion(+), 155 deletions(-) diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py index a3b4cdbc..53b1b98b 100644 --- a/src/khoj/processor/content/github/github_to_entries.py +++ b/src/khoj/processor/content/github/github_to_entries.py @@ -1,7 +1,6 @@ import logging import time -from datetime import datetime -from typing import Any, Dict, List, Tuple, Union +from typing import Any, List, Tuple import requests @@ -83,15 +82,6 @@ class GithubToEntries(TextToEntries): *GithubToEntries.extract_org_entries(org_files) ) - with timer(f"Extract commit messages from github repo {repo_shorthand}", logger): - current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo) - - with timer(f"Extract issues from github repo {repo_shorthand}", logger): - issue_entries = GithubToEntries.convert_issues_to_entries( - *GithubToEntries.extract_github_issues(self.get_issues(repo_url)) - ) - current_entries += issue_entries - with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger): current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256) @@ -170,121 +160,6 @@ class GithubToEntries(TextToEntries): return content - def get_commits(self, repo_url: str) -> List[Dict]: - return 
self._get_commits(f"{repo_url}/commits") - - def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]: - # Get commit messages from the repository using the Github API - params = {"per_page": 100} - commits = [] - - while commits_url is not None: - # Get the next page of commits - response = self.session.get(commits_url, params=params, stream=True) - - # Read the streamed response into a JSON object - content = response.json() - - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url) - if result is not None: - return result - - # Extract commit messages from the response - for commit in content: - if "commit" in commit and "message" in commit["commit"] and "html_url" in commit: - commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] - else: - logger.debug(f"Skipping commit with missing properties: {commit}") - - # Get the URL for the next page of commits, if any - commits_url = response.links.get("next", {}).get("url") - - return commits - - def get_issues(self, repo_url: str) -> List[Dict]: - return self._get_issues(f"{repo_url}/issues") - - def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]: - issues = [] - per_page = 100 - params = {"per_page": per_page, "state": "all"} - - while issues_url is not None: - # Get the next page of issues - response = self.session.get(issues_url, params=params, stream=True) # type: ignore - raw_issues = response.json() - - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url) - if result is not None: - return result - - for issue in raw_issues: - username = issue["user"]["login"] - user_url = f"[{username}]({issue['user']['html_url']})" - issue_content = { - "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}", - "path": issue["html_url"], - } - issue_content["created_at"] = 
{issue["created_at"]} - if issue["comments"] > 0: - issue_content["comments"] = self.get_comments(issue["comments_url"]) - issues += [issue_content] - - issues_url = response.links.get("next", {}).get("url") - - return issues - - def get_comments(self, comments_url: Union[str, None]) -> List[Dict]: - # By default, the number of results per page is 30. We'll keep it as-is for now. - comments = [] - per_page = 100 - params = {"per_page": per_page} - - while comments_url is not None: - # Get the next page of comments - response = self.session.get(comments_url, params=params, stream=True) - raw_comments = response.json() - - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url) - if result is not None: - return result - - for comment in raw_comments: - created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M") - commenter = comment["user"]["login"] - commenter_url = comment["user"]["html_url"] - comment_url = comment["html_url"] - comment_url_link = f"[{created_at}]({comment_url})" - avatar_url = comment["user"]["avatar_url"] - avatar = f"![{commenter}]({avatar_url})" - comments += [ - { - "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}" - } - ] - - comments_url = response.links.get("next", {}).get("url") - - return comments - - def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]: - entries: List[Entry] = [] - for commit in commits: - compiled = f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}' - entries.append( - Entry( - compiled=compiled, - raw=f'### {commit["content"]}', - heading=commit["content"].split("\n")[0], - file=commit["path"], - ) - ) - - return entries - @staticmethod def extract_markdown_entries(markdown_files): entries = [] @@ -305,32 +180,3 @@ class GithubToEntries(TextToEntries): doc["content"], doc["path"], entries, entry_to_file_map 
) return entries, dict(entry_to_file_map) - - @staticmethod - def extract_github_issues(issues): - entries = [] - entry_to_file_map = {} - for issue in issues: - content = issue["content"] - if "comments" in issue: - for comment in issue["comments"]: - content += "\n\n" + comment["content"] - entries.append(content) - entry_to_file_map[content] = {"path": issue["path"]} - return entries, entry_to_file_map - - @staticmethod - def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]: - entries = [] - for entry in parsed_entries: - entry_file_name = entry_to_metadata_map[entry]["path"] - entries.append( - Entry( - compiled=entry, - raw=entry, - heading=entry.split("\n")[0], - file=entry_file_name, - ) - ) - - return entries