From 9c70af960ca2d1c5dd357ef4a1dcf085833d1a2c Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sun, 18 Jun 2023 01:08:57 -0700
Subject: [PATCH] Extract logic to get file content from Github into a separate method

---
Review notes (free-text area after the three-dash line; not part of the commit message):
  * Moves the per-file download out of the markdown listing loop into a new
    get_file_contents(file_url) method on GithubToJsonl.
  * The request target changes along with the extraction: the removed code
    fetched f'{self.repo_url}/contents/{item["path"]}', while the new call
    passes item["url"].
    NOTE(review): presumably both return the same raw file text - confirm.
  * get_file_contents builds its own headers (Authorization set to
    self.config.pat_token, Accept set to "application/vnd.github.v3.raw")
    instead of mutating the caller's shared headers dict.
  * The response is handed to self.wait_for_rate_limit_reset together with
    self.get_file_contents and file_url; a non-None result is returned as is.
    That helper is not shown in this patch - presumably it waits and then
    re-invokes the passed method; verify against its definition.
  * Otherwise the body is returned as response.content decoded as UTF-8.
    NOTE(review): no HTTP status check or request timeout is visible here.
  * NOTE(review): line breaks and indentation below were reconstructed from a
    whitespace-collapsed copy - confirm against the original commit.

 src/khoj/processor/github/github_to_jsonl.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f622b5e9..f29bef2b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -98,16 +98,23 @@ class GithubToJsonl(TextToJsonl):
                 # Create URL for each markdown file on Github
                 url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
 
-                # Get text from each markdown file
-                file_content_url = f'{self.repo_url}/contents/{item["path"]}'
-                headers["Accept"] = "application/vnd.github.v3.raw"
-                markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
-
                 # Add markdown file contents and URL to list
-                markdown_files += [{"content": markdown_file_contents, "path": url_path}]
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
 
         return markdown_files
 
+    def get_file_contents(self, file_url):
+        # Get text from each markdown file
+        headers = {"Authorization": f"{self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        response = requests.get(file_url, headers=headers)
+
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
+        if result is not None:
+            return result
+
+        return response.content.decode("utf-8")
+
     def get_commits(self) -> List[Dict]:
         # Get commit messages from the repository using the Github API
         headers = {"Authorization": f"{self.config.pat_token}"}