From 10d4c38ce9e9bcb2dc72aea03f15fbfe91b877be Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:06:46 -0700 Subject: [PATCH] Extract Wait for rate limit reset logic into a function for reuse --- src/khoj/processor/github/github_to_jsonl.py | 33 +++++++++++--------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index d21f688b..f622b5e9 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -23,6 +23,16 @@ class GithubToJsonl(TextToJsonl): self.config = config self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}" + @staticmethod + def wait_for_rate_limit_reset(response, func, *args, **kwargs): + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) + logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds") + time.sleep(wait_time) + return func(*args, **kwargs) + else: + return + def process(self, previous_entries=None): with timer("Download markdown files from github repo", logger): try: @@ -76,12 +86,10 @@ class GithubToJsonl(TextToJsonl): response = requests.get(repo_content_url, headers=headers) contents = response.json() - # If the rate limit is reached, wait for the reset time - if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": - wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) - logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds") - time.sleep(wait_time) - return self.get_markdown_files() + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_markdown_files) + if result is not None: + return result markdown_files = [] for item in contents["tree"]: @@ -109,16 +117,13 @@ class GithubToJsonl(TextToJsonl): while commits_url is not None: # Get the next page of commits response = requests.get(commits_url, headers=headers) - - # If the rate limit is reached, wait for the reset time - if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": - wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time()) - logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds") - time.sleep(wait_time) - continue - raw_commits = response.json() + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_commits) + if result is not None: + return result + # Extract commit messages from the response for commit in raw_commits: commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]