From 079f409238079b8133922cbd6744b2d9811a802a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Mon, 8 Apr 2024 23:22:45 +0530 Subject: [PATCH] Skip indexing Github repo on hitting Github API rate limit Sleeping until the rate limit has passed is too expensive, as it keeps an app worker occupied. Ideally we should schedule a job to continue after the rate limit wait time has passed. But this can only be added once we support job scheduling. --- .../content/github/github_to_entries.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/khoj/processor/content/github/github_to_entries.py b/src/khoj/processor/content/github/github_to_entries.py index 53b1b98b..d6fd1a91 100644 --- a/src/khoj/processor/content/github/github_to_entries.py +++ b/src/khoj/processor/content/github/github_to_entries.py @@ -1,6 +1,6 @@ import logging import time -from typing import Any, List, Tuple +from typing import Any, Dict, List, Tuple import requests @@ -63,7 +63,9 @@ class GithubToEntries(TextToEntries): logger.info(f"Processing github repo {repo_shorthand}") with timer("Download markdown files from github repo", logger): try: - markdown_files, org_files = self.get_files(repo_url, repo) + markdown_files, org_files, plaintext_files = self.get_files(repo_url, repo) + except ConnectionAbortedError as e: + logger.error(f"Github rate limit reached. 
Skip indexing github repo {repo_shorthand}") except Exception as e: logger.error(f"Unable to download github repo {repo_shorthand}", exc_info=True) raise e @@ -109,10 +111,9 @@ class GithubToEntries(TextToEntries): response = requests.get(repo_content_url, headers=headers, params=params) contents = response.json() - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo) - if result is not None: - return result + # Raise exception if hit rate limit + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + raise ConnectionAbortedError("Github rate limit reached") # Extract markdown files from the repository markdown_files: List[Any] = [] @@ -144,10 +145,9 @@ class GithubToEntries(TextToEntries): headers = {"Accept": "application/vnd.github.v3.raw"} response = self.session.get(file_url, headers=headers, stream=True) - # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url) - if result is not None: - return result + # Stop indexing on hitting rate limit + if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0": + raise ConnectionAbortedError("Github rate limit reached") content = "" for chunk in response.iter_content(chunk_size=2048):